mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-15 19:02:42 +00:00
Implement reliable range sync for PeerDAS
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
use beacon_chain::block_verification_types::RpcBlock;
|
||||
use itertools::Itertools;
|
||||
use lighthouse_network::rpc::methods::BlocksByRangeRequest;
|
||||
use lighthouse_network::service::api_types::Id;
|
||||
use lighthouse_network::PeerId;
|
||||
@@ -17,15 +18,7 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5;
|
||||
/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty.
|
||||
const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3;
|
||||
|
||||
/// Type of expected batch.
|
||||
#[derive(Debug, Copy, Clone, Display)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum ByRangeRequestType {
|
||||
BlocksAndColumns,
|
||||
BlocksAndBlobs,
|
||||
Blocks,
|
||||
}
|
||||
|
||||
// TODO(das): Consider merging with PeerGroup
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BatchPeers {
|
||||
block_peer: PeerId,
|
||||
@@ -53,6 +46,12 @@ impl BatchPeers {
|
||||
pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> {
|
||||
self.column_peers.get(index)
|
||||
}
|
||||
|
||||
pub fn iter_unique_peers(&self) -> impl Iterator<Item = &PeerId> {
|
||||
std::iter::once(&self.block_peer)
|
||||
.chain(self.column_peers.values())
|
||||
.unique()
|
||||
}
|
||||
}
|
||||
|
||||
/// Allows customisation of the above constants used in other sync methods such as BackFillSync.
|
||||
|
||||
@@ -10,7 +10,7 @@ use itertools::Itertools;
|
||||
use lighthouse_network::service::api_types::Id;
|
||||
use lighthouse_network::{PeerAction, PeerId};
|
||||
use logging::crit;
|
||||
use std::collections::{btree_map::Entry, BTreeMap, HashSet};
|
||||
use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet};
|
||||
use strum::IntoStaticStr;
|
||||
use tracing::{debug, instrument, warn};
|
||||
use types::{Epoch, EthSpec, Hash256, Slot};
|
||||
@@ -87,9 +87,11 @@ pub struct SyncingChain<T: BeaconChainTypes> {
|
||||
batches: BTreeMap<BatchId, BatchInfo<T::EthSpec>>,
|
||||
|
||||
/// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain
|
||||
/// and thus available to download this chain from, as well as the batches we are currently
|
||||
/// requesting.
|
||||
peers: HashSet<PeerId>,
|
||||
/// and thus available to download this chain from.
|
||||
///
|
||||
/// Also, For each peer tracks the total requests done per peer as part of this SyncingChain
|
||||
/// `HashMap<peer, total_requests_per_peer>`
|
||||
peers: HashMap<PeerId, usize>,
|
||||
|
||||
/// Starting epoch of the next batch that needs to be downloaded.
|
||||
to_be_downloaded: BatchId,
|
||||
@@ -121,7 +123,40 @@ pub enum ChainSyncingState {
|
||||
Syncing,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub enum BatchStateSummary {
|
||||
Downloading,
|
||||
Processing,
|
||||
AwaitingProcessing,
|
||||
AwaitingValidation,
|
||||
Unexpected(&'static str),
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// Returns a summary of batch states for assertions in tests.
|
||||
#[cfg(test)]
|
||||
pub fn batches_state(&self) -> Vec<(BatchId, BatchStateSummary)> {
|
||||
self.batches
|
||||
.iter()
|
||||
.map(|(id, batch)| {
|
||||
let state = match batch.state() {
|
||||
// A batch is never left in this state, it's only the initial value
|
||||
BatchState::AwaitingDownload => {
|
||||
BatchStateSummary::Unexpected("AwaitingDownload")
|
||||
}
|
||||
BatchState::Downloading { .. } => BatchStateSummary::Downloading,
|
||||
BatchState::AwaitingProcessing { .. } => BatchStateSummary::AwaitingProcessing,
|
||||
BatchState::Poisoned => BatchStateSummary::Unexpected("Poisoned"),
|
||||
BatchState::Processing { .. } => BatchStateSummary::Processing,
|
||||
BatchState::Failed => BatchStateSummary::Unexpected("Failed"),
|
||||
BatchState::AwaitingValidation { .. } => BatchStateSummary::AwaitingValidation,
|
||||
};
|
||||
(*id, state)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
id: Id,
|
||||
@@ -138,7 +173,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
target_head_slot,
|
||||
target_head_root,
|
||||
batches: BTreeMap::new(),
|
||||
peers: HashSet::from_iter([peer_id]),
|
||||
peers: HashMap::from_iter([(peer_id, <_>::default())]),
|
||||
to_be_downloaded: start_epoch,
|
||||
processing_target: start_epoch,
|
||||
optimistic_start: None,
|
||||
@@ -168,7 +203,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// Peers currently syncing this chain.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn peers(&self) -> impl Iterator<Item = PeerId> + '_ {
|
||||
self.peers.iter().cloned()
|
||||
self.peers.keys().cloned()
|
||||
}
|
||||
|
||||
/// Progress in epochs made by the chain
|
||||
@@ -221,6 +256,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
request_id: Id,
|
||||
blocks: Vec<RpcBlock<T::EthSpec>>,
|
||||
) -> ProcessingResult {
|
||||
// Account for one more requests to this peer
|
||||
// TODO(das): this code assumes that we do a single request per peer per RpcBlock
|
||||
for peer in batch_peers.iter_unique_peers() {
|
||||
*self.peers.entry(*peer).or_default() += 1;
|
||||
}
|
||||
|
||||
// check if we have this batch
|
||||
let batch = match self.batches.get_mut(&batch_id) {
|
||||
None => {
|
||||
@@ -400,11 +441,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
self.request_batches(network)?;
|
||||
}
|
||||
}
|
||||
} else if !self.good_peers_on_sampling_subnets(self.processing_target, network) {
|
||||
// This is to handle the case where no batch was sent for the current processing
|
||||
// target when there is no sampling peers available. This is a valid state and should not
|
||||
// return an error.
|
||||
return Ok(KeepChain);
|
||||
} else {
|
||||
return Err(RemoveChain::WrongChainState(format!(
|
||||
"Batch not found for current processing target {}",
|
||||
@@ -577,7 +613,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
"Batch failed to download. Dropping chain scoring peers"
|
||||
);
|
||||
|
||||
for peer in self.peers.drain() {
|
||||
for (peer, _) in self.peers.drain() {
|
||||
network.report_peer(peer, penalty, "faulty_chain");
|
||||
}
|
||||
Err(RemoveChain::ChainFailed {
|
||||
@@ -842,7 +878,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
peer_id: PeerId,
|
||||
) -> ProcessingResult {
|
||||
self.peers.insert(peer_id);
|
||||
self.peers.insert(peer_id, <_>::default());
|
||||
self.request_batches(network)
|
||||
}
|
||||
|
||||
@@ -854,7 +890,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
batch_id: BatchId,
|
||||
peer_id: &PeerId,
|
||||
request_id: Id,
|
||||
err: RpcResponseError,
|
||||
) -> ProcessingResult {
|
||||
@@ -869,7 +904,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
debug!(
|
||||
batch_epoch = %batch_id,
|
||||
batch_state = ?batch.state(),
|
||||
%peer_id,
|
||||
%request_id,
|
||||
?batch_state,
|
||||
"Batch not expecting block"
|
||||
@@ -880,12 +914,13 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
batch_epoch = %batch_id,
|
||||
batch_state = ?batch.state(),
|
||||
error = ?err,
|
||||
%peer_id,
|
||||
%request_id,
|
||||
"Batch download error"
|
||||
);
|
||||
if let BatchOperationOutcome::Failed { blacklist } =
|
||||
batch.download_failed(Some(*peer_id))?
|
||||
// TODO(das): Is it necessary for the batch to track failed peers? Can we make this
|
||||
// mechanism compatible with PeerDAS and before PeerDAS?
|
||||
batch.download_failed(None)?
|
||||
{
|
||||
return Err(RemoveChain::ChainFailed {
|
||||
blacklist,
|
||||
@@ -896,7 +931,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
} else {
|
||||
debug!(
|
||||
batch_epoch = %batch_id,
|
||||
%peer_id,
|
||||
%request_id,
|
||||
batch_state,
|
||||
"Batch not found"
|
||||
@@ -937,6 +971,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
},
|
||||
&synced_peers,
|
||||
&failed_peers,
|
||||
&self.peers,
|
||||
) {
|
||||
Ok(request_id) => {
|
||||
// inform the batch about the new request
|
||||
@@ -953,14 +988,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
return Ok(KeepChain);
|
||||
}
|
||||
Err(e) => match e {
|
||||
// TODO(das): Handle the NoPeer case explicitly and don't drop the batch. For
|
||||
// sync to work properly it must be okay to have "stalled" batches in
|
||||
// AwaitingDownload state. Currently it will error with invalid state if
|
||||
// that happens. Sync manager must periodicatlly prune stalled batches like
|
||||
// we do for lookup sync. Then we can deprecate the redundant
|
||||
// `good_peers_on_sampling_subnets` checks.
|
||||
e
|
||||
@ (RpcRequestSendError::NoPeer(_) | RpcRequestSendError::InternalError(_)) => {
|
||||
RpcRequestSendError::InternalError(e) => {
|
||||
// NOTE: under normal conditions this shouldn't happen but we handle it anyway
|
||||
warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request");
|
||||
// register the failed download and check if the batch can be retried
|
||||
@@ -1019,11 +1047,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// check if we have the batch for our optimistic start. If not, request it first.
|
||||
// We wait for this batch before requesting any other batches.
|
||||
if let Some(epoch) = self.optimistic_start {
|
||||
if !self.good_peers_on_sampling_subnets(epoch, network) {
|
||||
debug!("Waiting for peers to be available on sampling column subnets");
|
||||
return Ok(KeepChain);
|
||||
}
|
||||
|
||||
if let Entry::Vacant(entry) = self.batches.entry(epoch) {
|
||||
let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH);
|
||||
entry.insert(optimistic_batch);
|
||||
@@ -1046,35 +1069,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
Ok(KeepChain)
|
||||
}
|
||||
|
||||
/// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in
|
||||
/// every sampling column subnet.
|
||||
fn good_peers_on_sampling_subnets(
|
||||
&self,
|
||||
epoch: Epoch,
|
||||
network: &SyncNetworkContext<T>,
|
||||
) -> bool {
|
||||
if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) {
|
||||
// Require peers on all sampling column subnets before sending batches
|
||||
let peers_on_all_custody_subnets = network
|
||||
.network_globals()
|
||||
.sampling_subnets
|
||||
.iter()
|
||||
.all(|subnet_id| {
|
||||
let peer_count = network
|
||||
.network_globals()
|
||||
.peers
|
||||
.read()
|
||||
.good_custody_subnet_peer(*subnet_id)
|
||||
.count();
|
||||
|
||||
peer_count > 0
|
||||
});
|
||||
peers_on_all_custody_subnets
|
||||
} else {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates the next required batch from the chain. If there are no more batches required,
|
||||
/// `false` is returned.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
@@ -1107,15 +1101,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
return None;
|
||||
}
|
||||
|
||||
// don't send batch requests until we have peers on sampling subnets
|
||||
// TODO(das): this is a workaround to avoid sending out excessive block requests because
|
||||
// block and data column requests are currently coupled. This can be removed once we find a
|
||||
// way to decouple the requests and do retries individually, see issue #6258.
|
||||
if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) {
|
||||
debug!("Waiting for peers to be available on custody column subnets");
|
||||
return None;
|
||||
}
|
||||
|
||||
// If no batch needs a retry, attempt to send the batch of the next epoch to download
|
||||
let next_batch_id = self.to_be_downloaded;
|
||||
// this batch could have been included already being an optimistic batch
|
||||
|
||||
@@ -54,6 +54,13 @@ pub struct ChainCollection<T: BeaconChainTypes> {
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn iter(&self) -> impl Iterator<Item = &SyncingChain<T>> {
|
||||
self.finalized_chains
|
||||
.values()
|
||||
.chain(self.head_chains.values())
|
||||
}
|
||||
|
||||
pub fn new(beacon_chain: Arc<BeaconChain<T>>) -> Self {
|
||||
ChainCollection {
|
||||
beacon_chain,
|
||||
|
||||
@@ -9,10 +9,9 @@ mod sync_type;
|
||||
|
||||
pub use batch::{
|
||||
BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState,
|
||||
ByRangeRequestType,
|
||||
};
|
||||
pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH};
|
||||
#[cfg(test)]
|
||||
pub use chain_collection::SyncChainStatus;
|
||||
pub use chain::BatchStateSummary;
|
||||
pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH};
|
||||
pub use range::RangeSync;
|
||||
pub use sync_type::RangeSyncType;
|
||||
|
||||
@@ -39,6 +39,8 @@
|
||||
//! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially
|
||||
//! and further batches are requested as current blocks are being processed.
|
||||
|
||||
#[cfg(test)]
|
||||
use super::chain::BatchStateSummary;
|
||||
use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain};
|
||||
use super::chain_collection::{ChainCollection, SyncChainStatus};
|
||||
use super::sync_type::RangeSyncType;
|
||||
@@ -100,10 +102,23 @@ where
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn __failed_chains(&mut self) -> Vec<Hash256> {
|
||||
pub(crate) fn failed_chains(&mut self) -> Vec<Hash256> {
|
||||
self.failed_chains.keys().copied().collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, BatchStateSummary)> {
|
||||
self.chains
|
||||
.iter()
|
||||
.flat_map(|chain| {
|
||||
chain
|
||||
.batches_state()
|
||||
.into_iter()
|
||||
.map(|(batch_id, state)| (chain.id(), batch_id, state))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
@@ -344,7 +359,6 @@ where
|
||||
pub fn inject_error(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
peer_id: PeerId,
|
||||
batch_id: BatchId,
|
||||
chain_id: ChainId,
|
||||
request_id: Id,
|
||||
@@ -352,7 +366,7 @@ where
|
||||
) {
|
||||
// check that this request is pending
|
||||
match self.chains.call_by_id(chain_id, |chain| {
|
||||
chain.inject_error(network, batch_id, &peer_id, request_id, err)
|
||||
chain.inject_error(network, batch_id, request_id, err)
|
||||
}) {
|
||||
Ok((removed_chain, sync_type)) => {
|
||||
if let Some((removed_chain, remove_reason)) = removed_chain {
|
||||
|
||||
Reference in New Issue
Block a user