mirror of
https://github.com/sigp/lighthouse.git
synced 2026-05-08 01:05:47 +00:00
Resolve more TODOs
This commit is contained in:
@@ -21,8 +21,9 @@ use beacon_chain::{BeaconChain, BeaconChainTypes};
|
|||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use lighthouse_network::service::api_types::Id;
|
use lighthouse_network::service::api_types::Id;
|
||||||
use lighthouse_network::types::{BackFillState, NetworkGlobals};
|
use lighthouse_network::types::{BackFillState, NetworkGlobals};
|
||||||
use lighthouse_network::PeerAction;
|
use lighthouse_network::{PeerAction, PeerId};
|
||||||
use logging::crit;
|
use logging::crit;
|
||||||
|
use parking_lot::RwLock;
|
||||||
use std::collections::{
|
use std::collections::{
|
||||||
btree_map::{BTreeMap, Entry},
|
btree_map::{BTreeMap, Entry},
|
||||||
HashMap, HashSet,
|
HashMap, HashSet,
|
||||||
@@ -135,6 +136,8 @@ pub struct BackFillSync<T: BeaconChainTypes> {
|
|||||||
/// This signifies that we are able to attempt to restart a failed chain.
|
/// This signifies that we are able to attempt to restart a failed chain.
|
||||||
restart_failed_sync: bool,
|
restart_failed_sync: bool,
|
||||||
|
|
||||||
|
peers: Arc<RwLock<HashSet<PeerId>>>,
|
||||||
|
|
||||||
/// Reference to the beacon chain to obtain initial starting points for the backfill sync.
|
/// Reference to the beacon chain to obtain initial starting points for the backfill sync.
|
||||||
beacon_chain: Arc<BeaconChain<T>>,
|
beacon_chain: Arc<BeaconChain<T>>,
|
||||||
|
|
||||||
@@ -179,6 +182,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
|||||||
current_processing_batch: None,
|
current_processing_batch: None,
|
||||||
validated_batches: 0,
|
validated_batches: 0,
|
||||||
restart_failed_sync: false,
|
restart_failed_sync: false,
|
||||||
|
peers: <_>::default(),
|
||||||
beacon_chain,
|
beacon_chain,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -218,14 +222,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
|||||||
match self.state() {
|
match self.state() {
|
||||||
BackFillState::Syncing => {} // already syncing ignore.
|
BackFillState::Syncing => {} // already syncing ignore.
|
||||||
BackFillState::Paused => {
|
BackFillState::Paused => {
|
||||||
if self
|
if !self.peers.read().is_empty() {
|
||||||
.network_globals
|
|
||||||
.peers
|
|
||||||
.read()
|
|
||||||
.synced_peers()
|
|
||||||
.next()
|
|
||||||
.is_some()
|
|
||||||
{
|
|
||||||
// If there are peers to resume with, begin the resume.
|
// If there are peers to resume with, begin the resume.
|
||||||
debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync");
|
debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync");
|
||||||
self.set_state(BackFillState::Syncing);
|
self.set_state(BackFillState::Syncing);
|
||||||
@@ -298,6 +295,14 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn add_peer(&mut self, peer_id: PeerId) {
|
||||||
|
self.peers.write().insert(peer_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn peer_disconnected(&mut self, peer_id: &PeerId) {
|
||||||
|
self.peers.write().remove(peer_id);
|
||||||
|
}
|
||||||
|
|
||||||
/// An RPC error has occurred.
|
/// An RPC error has occurred.
|
||||||
///
|
///
|
||||||
/// If the batch exists it is re-requested.
|
/// If the batch exists it is re-requested.
|
||||||
@@ -920,20 +925,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
|||||||
batch_id: BatchId,
|
batch_id: BatchId,
|
||||||
) -> Result<(), BackFillError> {
|
) -> Result<(), BackFillError> {
|
||||||
if let Some(batch) = self.batches.get_mut(&batch_id) {
|
if let Some(batch) = self.batches.get_mut(&batch_id) {
|
||||||
let synced_peers = self
|
|
||||||
.network_globals
|
|
||||||
.peers
|
|
||||||
.read()
|
|
||||||
.synced_peers()
|
|
||||||
.cloned()
|
|
||||||
.collect::<HashSet<_>>();
|
|
||||||
|
|
||||||
let request = batch.to_blocks_by_range_request();
|
let request = batch.to_blocks_by_range_request();
|
||||||
let failed_peers = batch.failed_block_peers();
|
let failed_peers = batch.failed_block_peers();
|
||||||
match network.block_components_by_range_request(
|
match network.block_components_by_range_request(
|
||||||
request,
|
request,
|
||||||
RangeRequestId::BackfillSync { batch_id },
|
RangeRequestId::BackfillSync { batch_id },
|
||||||
&synced_peers,
|
self.peers.clone(),
|
||||||
&failed_peers,
|
&failed_peers,
|
||||||
// Does not track total requests per peers for now
|
// Does not track total requests per peers for now
|
||||||
&HashMap::new(),
|
&HashMap::new(),
|
||||||
|
|||||||
@@ -413,6 +413,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
|||||||
PeerSyncType::Advanced => {
|
PeerSyncType::Advanced => {
|
||||||
self.range_sync
|
self.range_sync
|
||||||
.add_peer(&mut self.network, local, peer_id, remote);
|
.add_peer(&mut self.network, local, peer_id, remote);
|
||||||
|
self.backfill_sync.add_peer(peer_id);
|
||||||
}
|
}
|
||||||
PeerSyncType::FullySynced => {
|
PeerSyncType::FullySynced => {
|
||||||
// Sync considers this peer close enough to the head to not trigger range sync.
|
// Sync considers this peer close enough to the head to not trigger range sync.
|
||||||
@@ -530,6 +531,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
|||||||
|
|
||||||
// Remove peer from all data structures
|
// Remove peer from all data structures
|
||||||
self.range_sync.peer_disconnect(&mut self.network, peer_id);
|
self.range_sync.peer_disconnect(&mut self.network, peer_id);
|
||||||
|
self.backfill_sync.peer_disconnected(peer_id);
|
||||||
self.block_lookups.peer_disconnected(peer_id);
|
self.block_lookups.peer_disconnected(peer_id);
|
||||||
|
|
||||||
// Regardless of the outcome, we update the sync status.
|
// Regardless of the outcome, we update the sync status.
|
||||||
|
|||||||
@@ -478,7 +478,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
request: BlocksByRangeRequest,
|
request: BlocksByRangeRequest,
|
||||||
requester: RangeRequestId,
|
requester: RangeRequestId,
|
||||||
peers: &HashSet<PeerId>,
|
peers: Arc<RwLock<HashSet<PeerId>>>,
|
||||||
peers_to_deprioritize: &HashSet<PeerId>,
|
peers_to_deprioritize: &HashSet<PeerId>,
|
||||||
total_requests_per_peer: &HashMap<PeerId, usize>,
|
total_requests_per_peer: &HashMap<PeerId, usize>,
|
||||||
) -> Result<Id, RpcRequestSendError> {
|
) -> Result<Id, RpcRequestSendError> {
|
||||||
@@ -498,7 +498,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
|||||||
|
|
||||||
self.block_components_by_range_requests.insert(id, req);
|
self.block_components_by_range_requests.insert(id, req);
|
||||||
|
|
||||||
// TODO: use ID
|
|
||||||
Ok(id.id)
|
Ok(id.id)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -91,7 +91,7 @@ impl From<Error> for RpcRequestSendError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// FOR TESTING ONLY
|
/// Used to make type-safe assertions about state in range sync tests
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum BlockComponentsByRangeRequestStep {
|
pub enum BlockComponentsByRangeRequestStep {
|
||||||
@@ -103,7 +103,7 @@ impl<T: BeaconChainTypes> BlockComponentsByRangeRequest<T> {
|
|||||||
pub fn new(
|
pub fn new(
|
||||||
id: ComponentsByRangeRequestId,
|
id: ComponentsByRangeRequestId,
|
||||||
request: BlocksByRangeRequest,
|
request: BlocksByRangeRequest,
|
||||||
peers: &HashSet<PeerId>,
|
peers: Arc<RwLock<HashSet<PeerId>>>,
|
||||||
peers_to_deprioritize: &HashSet<PeerId>,
|
peers_to_deprioritize: &HashSet<PeerId>,
|
||||||
total_requests_per_peer: &HashMap<PeerId, usize>,
|
total_requests_per_peer: &HashMap<PeerId, usize>,
|
||||||
cx: &mut SyncNetworkContext<T>,
|
cx: &mut SyncNetworkContext<T>,
|
||||||
@@ -123,6 +123,7 @@ impl<T: BeaconChainTypes> BlockComponentsByRangeRequest<T> {
|
|||||||
// will request all blocks for the first 5 epochs to that same single peer. Before we would
|
// will request all blocks for the first 5 epochs to that same single peer. Before we would
|
||||||
// query only idle peers in the syncing chain.
|
// query only idle peers in the syncing chain.
|
||||||
let Some(block_peer) = peers
|
let Some(block_peer) = peers
|
||||||
|
.read()
|
||||||
.iter()
|
.iter()
|
||||||
.map(|peer| {
|
.map(|peer| {
|
||||||
(
|
(
|
||||||
@@ -180,9 +181,7 @@ impl<T: BeaconChainTypes> BlockComponentsByRangeRequest<T> {
|
|||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
id,
|
id,
|
||||||
// TODO(das): share the rwlock with the range sync batch. Are peers added to the batch
|
peers,
|
||||||
// after being created?
|
|
||||||
peers: Arc::new(RwLock::new(peers.clone())),
|
|
||||||
request,
|
request,
|
||||||
state,
|
state,
|
||||||
})
|
})
|
||||||
@@ -511,8 +510,6 @@ fn couple_blocks_fulu<E: EthSpec>(
|
|||||||
.remove(&block_root)
|
.remove(&block_root)
|
||||||
.unwrap_or_default();
|
.unwrap_or_default();
|
||||||
|
|
||||||
// TODO(das): Change RpcBlock to holding a Vec of DataColumnSidecars so we don't need
|
|
||||||
// the spec here.
|
|
||||||
RpcBlock::new_with_custody_columns(
|
RpcBlock::new_with_custody_columns(
|
||||||
Some(block_root),
|
Some(block_root),
|
||||||
block,
|
block,
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
use super::custody_by_root::{ColumnRequest, Error};
|
use super::custody_by_root::{ColumnRequest, Error};
|
||||||
use crate::sync::network_context::RpcResponseError;
|
|
||||||
use beacon_chain::validator_monitor::timestamp_now;
|
use beacon_chain::validator_monitor::timestamp_now;
|
||||||
use beacon_chain::BeaconChainTypes;
|
use beacon_chain::BeaconChainTypes;
|
||||||
use fnv::FnvHashMap;
|
use fnv::FnvHashMap;
|
||||||
@@ -22,7 +21,7 @@ use types::{
|
|||||||
|
|
||||||
use super::{PeerGroup, RpcResponseResult, SyncNetworkContext};
|
use super::{PeerGroup, RpcResponseResult, SyncNetworkContext};
|
||||||
|
|
||||||
const TEMPORARY_FAULT_EXPIRY_SECONDS: u64 = 15;
|
const FAILED_PEERS_EXPIRY_SECONDS: u64 = 15;
|
||||||
const REQUEST_EXPIRY_SECONDS: u64 = 300;
|
const REQUEST_EXPIRY_SECONDS: u64 = 300;
|
||||||
|
|
||||||
pub struct ActiveCustodyByRangeRequest<T: BeaconChainTypes> {
|
pub struct ActiveCustodyByRangeRequest<T: BeaconChainTypes> {
|
||||||
@@ -41,13 +40,7 @@ pub struct ActiveCustodyByRangeRequest<T: BeaconChainTypes> {
|
|||||||
FnvHashMap<DataColumnsByRangeRequestId, ActiveBatchColumnsRequest>,
|
FnvHashMap<DataColumnsByRangeRequestId, ActiveBatchColumnsRequest>,
|
||||||
/// Peers that have recently failed to successfully respond to a columns by root request.
|
/// Peers that have recently failed to successfully respond to a columns by root request.
|
||||||
/// Having a LRUTimeCache allows this request to not have to track disconnecting peers.
|
/// Having a LRUTimeCache allows this request to not have to track disconnecting peers.
|
||||||
peers_with_custody_failures: LRUTimeCache<PeerId>,
|
failed_peers: LRUTimeCache<PeerId>,
|
||||||
peers_with_temporary_faults: LRUTimeCache<PeerId>,
|
|
||||||
// TODO(das): does this HashSet has an OOM risk? We should either: make sure that this request
|
|
||||||
// structs are dropped after some time, that disconnected peers are pruned (but we may want to
|
|
||||||
// retain faulty information if they just disconnect and reconnect) or make this an LRUTimeCache
|
|
||||||
// with a long time (like 5 minutes).
|
|
||||||
peers_with_permanent_faults: HashSet<PeerId>,
|
|
||||||
/// Set of peers that claim to have imported this block and their custody columns
|
/// Set of peers that claim to have imported this block and their custody columns
|
||||||
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
|
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
|
||||||
|
|
||||||
@@ -89,13 +82,7 @@ impl<T: BeaconChainTypes> ActiveCustodyByRangeRequest<T> {
|
|||||||
.map(|index| (*index, ColumnRequest::new())),
|
.map(|index| (*index, ColumnRequest::new())),
|
||||||
),
|
),
|
||||||
active_batch_columns_requests: <_>::default(),
|
active_batch_columns_requests: <_>::default(),
|
||||||
peers_with_custody_failures: LRUTimeCache::new(Duration::from_secs(
|
failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_EXPIRY_SECONDS)),
|
||||||
TEMPORARY_FAULT_EXPIRY_SECONDS,
|
|
||||||
)),
|
|
||||||
peers_with_temporary_faults: LRUTimeCache::new(Duration::from_secs(
|
|
||||||
TEMPORARY_FAULT_EXPIRY_SECONDS,
|
|
||||||
)),
|
|
||||||
peers_with_permanent_faults: HashSet::new(),
|
|
||||||
lookup_peers,
|
lookup_peers,
|
||||||
_phantom: PhantomData,
|
_phantom: PhantomData,
|
||||||
}
|
}
|
||||||
@@ -138,7 +125,7 @@ impl<T: BeaconChainTypes> ActiveCustodyByRangeRequest<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Accumulate columns that the peer does not have to issue a single log per request
|
// Accumulate columns that the peer does not have to issue a single log per request
|
||||||
let mut missing_column_indexes = vec![];
|
let mut missing_column_indices = vec![];
|
||||||
let mut incorrect_column_indices = vec![];
|
let mut incorrect_column_indices = vec![];
|
||||||
let mut imported_column_indices = vec![];
|
let mut imported_column_indices = vec![];
|
||||||
|
|
||||||
@@ -178,14 +165,8 @@ impl<T: BeaconChainTypes> ActiveCustodyByRangeRequest<T> {
|
|||||||
// - peer custodies this column `index`
|
// - peer custodies this column `index`
|
||||||
// - peer claims to be synced to at least `slot`
|
// - peer claims to be synced to at least `slot`
|
||||||
//
|
//
|
||||||
// Therefore not returning this column is an protocol violation that we
|
// Then we penalize the faulty peer, mark it as failed and try with
|
||||||
// penalize and mark the peer as failed to retry with another peer.
|
// another.
|
||||||
//
|
|
||||||
// TODO(das) do not consider this case a success. We know for sure the block has
|
|
||||||
// data. However we allow the peer to return empty as we can't attribute fault.
|
|
||||||
// TODO(das): Should track which columns are missing and eventually give up
|
|
||||||
// TODO(das): If the peer is in the lookup peer set it claims to have imported
|
|
||||||
// the block AND its custody columns. So in this case we can downscore
|
|
||||||
Err(ColumnResponseError::MissingColumn(slot))
|
Err(ColumnResponseError::MissingColumn(slot))
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@@ -219,15 +200,15 @@ impl<T: BeaconChainTypes> ActiveCustodyByRangeRequest<T> {
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
ColumnResponseError::MissingColumn(slot) => {
|
ColumnResponseError::MissingColumn(slot) => {
|
||||||
missing_column_indexes.push((index, slot));
|
missing_column_indices.push((index, slot));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Log missing_column_indexes and incorrect_column_indices here in batch per request
|
// Log `imported_column_indices`, `missing_column_indices` and
|
||||||
// to make this logs more compact and less noisy.
|
// `incorrect_column_indices` once per request to make the logs less noisy.
|
||||||
if !imported_column_indices.is_empty() {
|
if !imported_column_indices.is_empty() {
|
||||||
// TODO(das): this log may be redundant. We already log on DataColumnsByRange
|
// TODO(das): this log may be redundant. We already log on DataColumnsByRange
|
||||||
// completed, and on DataColumnsByRange sent we log the column indices
|
// completed, and on DataColumnsByRange sent we log the column indices
|
||||||
@@ -246,21 +227,18 @@ impl<T: BeaconChainTypes> ActiveCustodyByRangeRequest<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !incorrect_column_indices.is_empty() {
|
if !incorrect_column_indices.is_empty() {
|
||||||
// Note: Batch logging that columns are missing to not spam logger
|
|
||||||
debug!(
|
debug!(
|
||||||
id = %self.id,
|
id = %self.id,
|
||||||
data_columns_by_range_req_id = %req_id,
|
data_columns_by_range_req_id = %req_id,
|
||||||
%peer_id,
|
%peer_id,
|
||||||
// TODO(das): this property can become very noisy, being the full range 0..128
|
?incorrect_column_indices,
|
||||||
incorrect_columns = ?incorrect_column_indices,
|
|
||||||
"Custody by range peer returned non-matching columns"
|
"Custody by range peer returned non-matching columns"
|
||||||
);
|
);
|
||||||
|
|
||||||
// Returning a non-canonical column is not a permanent fault. We should not
|
// Returning a non-canonical column is not a permanent fault. We should not
|
||||||
// retry the peer for some time but the peer may return a canonical column in
|
// retry the peer for some time but the peer may return a canonical column in
|
||||||
// the future.
|
// the future.
|
||||||
// TODO(das): if this finalized sync the fault is permanent
|
self.failed_peers.insert(peer_id);
|
||||||
self.peers_with_temporary_faults.insert(peer_id);
|
|
||||||
cx.report_peer(
|
cx.report_peer(
|
||||||
peer_id,
|
peer_id,
|
||||||
PeerAction::MidToleranceError,
|
PeerAction::MidToleranceError,
|
||||||
@@ -268,19 +246,17 @@ impl<T: BeaconChainTypes> ActiveCustodyByRangeRequest<T> {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if !missing_column_indexes.is_empty() {
|
if !missing_column_indices.is_empty() {
|
||||||
// Note: Batch logging that columns are missing to not spam logger
|
|
||||||
debug!(
|
debug!(
|
||||||
id = %self.id,
|
id = %self.id,
|
||||||
data_columns_by_range_req_id = %req_id,
|
data_columns_by_range_req_id = %req_id,
|
||||||
%peer_id,
|
%peer_id,
|
||||||
// TODO(das): this property can become very noisy, being the full range 0..128
|
?missing_column_indices,
|
||||||
?missing_column_indexes,
|
|
||||||
"Custody by range peer claims to not have some data"
|
"Custody by range peer claims to not have some data"
|
||||||
);
|
);
|
||||||
|
|
||||||
// Not having columns is not a permanent fault. The peer may be backfilling.
|
// Not having columns is not a permanent fault. The peer may be backfilling.
|
||||||
self.peers_with_custody_failures.insert(peer_id);
|
self.failed_peers.insert(peer_id);
|
||||||
cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure");
|
cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -293,7 +269,6 @@ impl<T: BeaconChainTypes> ActiveCustodyByRangeRequest<T> {
|
|||||||
"Custody by range download error"
|
"Custody by range download error"
|
||||||
);
|
);
|
||||||
|
|
||||||
// TODO(das): Should mark peer as failed and try from another peer
|
|
||||||
for column_index in &batch_request.indices {
|
for column_index in &batch_request.indices {
|
||||||
self.column_requests
|
self.column_requests
|
||||||
.get_mut(column_index)
|
.get_mut(column_index)
|
||||||
@@ -301,22 +276,8 @@ impl<T: BeaconChainTypes> ActiveCustodyByRangeRequest<T> {
|
|||||||
.on_download_error_and_mark_failure(req_id, err.clone())?;
|
.on_download_error_and_mark_failure(req_id, err.clone())?;
|
||||||
}
|
}
|
||||||
|
|
||||||
match err {
|
// An RpcResponseError is already downscored in network_context
|
||||||
// Verify errors are correctness errors against our request or about the
|
self.failed_peers.insert(peer_id);
|
||||||
// returned data itself. This peer is faulty or malicious, should not be
|
|
||||||
// retried.
|
|
||||||
RpcResponseError::VerifyError(_) => {
|
|
||||||
self.peers_with_permanent_faults.insert(peer_id);
|
|
||||||
}
|
|
||||||
// Network errors are not permanent faults and worth retrying
|
|
||||||
RpcResponseError::RpcError(_) => {
|
|
||||||
self.peers_with_temporary_faults.insert(peer_id);
|
|
||||||
}
|
|
||||||
// Do nothing for internal errors
|
|
||||||
RpcResponseError::InternalError(_) => {}
|
|
||||||
// unreachable
|
|
||||||
RpcResponseError::RequestExpired(_) => {}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -386,18 +347,13 @@ impl<T: BeaconChainTypes> ActiveCustodyByRangeRequest<T> {
|
|||||||
let mut priorized_peers = custodial_peers
|
let mut priorized_peers = custodial_peers
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|peer| {
|
.filter(|peer| {
|
||||||
// Never request again peers with permanent faults
|
// Do not send requests to recently failed peers for some time
|
||||||
// Do not request peers with custody failures for some time
|
!self.failed_peers.contains(peer)
|
||||||
!self.peers_with_permanent_faults.contains(peer)
|
|
||||||
&& !self.peers_with_custody_failures.contains(peer)
|
|
||||||
})
|
})
|
||||||
.map(|peer| {
|
.map(|peer| {
|
||||||
(
|
(
|
||||||
// Prioritize peers that claim to know have imported this block
|
// Prioritize peers that claim to know have imported this block
|
||||||
if lookup_peers.contains(peer) { 0 } else { 1 },
|
if lookup_peers.contains(peer) { 0 } else { 1 },
|
||||||
// De-prioritize peers that have failed to successfully respond to
|
|
||||||
// requests recently, but allow to immediatelly request them again
|
|
||||||
self.peers_with_temporary_faults.contains(peer),
|
|
||||||
// Prefer peers with fewer requests to load balance across peers.
|
// Prefer peers with fewer requests to load balance across peers.
|
||||||
// We batch requests to the same peer, so count existence in the
|
// We batch requests to the same peer, so count existence in the
|
||||||
// `columns_to_request_by_peer` as a single 1 request.
|
// `columns_to_request_by_peer` as a single 1 request.
|
||||||
@@ -411,7 +367,7 @@ impl<T: BeaconChainTypes> ActiveCustodyByRangeRequest<T> {
|
|||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
priorized_peers.sort_unstable();
|
priorized_peers.sort_unstable();
|
||||||
|
|
||||||
if let Some((_, _, _, _, peer_id)) = priorized_peers.first() {
|
if let Some((_, _, _, peer_id)) = priorized_peers.first() {
|
||||||
columns_to_request_by_peer
|
columns_to_request_by_peer
|
||||||
.entry(*peer_id)
|
.entry(*peer_id)
|
||||||
.or_default()
|
.or_default()
|
||||||
|
|||||||
@@ -10,7 +10,9 @@ use itertools::Itertools;
|
|||||||
use lighthouse_network::service::api_types::Id;
|
use lighthouse_network::service::api_types::Id;
|
||||||
use lighthouse_network::{PeerAction, PeerId};
|
use lighthouse_network::{PeerAction, PeerId};
|
||||||
use logging::crit;
|
use logging::crit;
|
||||||
|
use parking_lot::RwLock;
|
||||||
use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet};
|
use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet};
|
||||||
|
use std::sync::Arc;
|
||||||
use strum::IntoStaticStr;
|
use strum::IntoStaticStr;
|
||||||
use tracing::{debug, instrument, warn};
|
use tracing::{debug, instrument, warn};
|
||||||
use types::{Epoch, EthSpec, Hash256, Slot};
|
use types::{Epoch, EthSpec, Hash256, Slot};
|
||||||
@@ -91,7 +93,11 @@ pub struct SyncingChain<T: BeaconChainTypes> {
|
|||||||
///
|
///
|
||||||
/// Also, For each peer tracks the total requests done per peer as part of this SyncingChain
|
/// Also, For each peer tracks the total requests done per peer as part of this SyncingChain
|
||||||
/// `HashMap<peer, total_requests_per_peer>`
|
/// `HashMap<peer, total_requests_per_peer>`
|
||||||
peers: HashMap<PeerId, usize>,
|
peers: Arc<RwLock<HashSet<PeerId>>>,
|
||||||
|
|
||||||
|
/// Tracks the total requests done to each peer for this SyncingChain. Forces us to fetch data
|
||||||
|
/// from all peers to prevent eclipse attacks
|
||||||
|
requests_per_peer: HashMap<PeerId, usize>,
|
||||||
|
|
||||||
/// Starting epoch of the next batch that needs to be downloaded.
|
/// Starting epoch of the next batch that needs to be downloaded.
|
||||||
to_be_downloaded: BatchId,
|
to_be_downloaded: BatchId,
|
||||||
@@ -173,7 +179,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
target_head_slot,
|
target_head_slot,
|
||||||
target_head_root,
|
target_head_root,
|
||||||
batches: BTreeMap::new(),
|
batches: BTreeMap::new(),
|
||||||
peers: HashMap::from_iter([(peer_id, <_>::default())]),
|
peers: Arc::new(RwLock::new(HashSet::from_iter([peer_id]))),
|
||||||
|
requests_per_peer: HashMap::from_iter([(peer_id, <_>::default())]),
|
||||||
to_be_downloaded: start_epoch,
|
to_be_downloaded: start_epoch,
|
||||||
processing_target: start_epoch,
|
processing_target: start_epoch,
|
||||||
optimistic_start: None,
|
optimistic_start: None,
|
||||||
@@ -191,7 +198,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
/// Check if the chain has peers from which to process batches.
|
/// Check if the chain has peers from which to process batches.
|
||||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||||
pub fn available_peers(&self) -> usize {
|
pub fn available_peers(&self) -> usize {
|
||||||
self.peers.len()
|
self.peers.read().len()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the chain's id.
|
/// Get the chain's id.
|
||||||
@@ -203,7 +210,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
/// Peers currently syncing this chain.
|
/// Peers currently syncing this chain.
|
||||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||||
pub fn peers(&self) -> impl Iterator<Item = PeerId> + '_ {
|
pub fn peers(&self) -> impl Iterator<Item = PeerId> + '_ {
|
||||||
self.peers.keys().cloned()
|
self.peers
|
||||||
|
.read()
|
||||||
|
.iter()
|
||||||
|
.copied()
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.into_iter()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Progress in epochs made by the chain
|
/// Progress in epochs made by the chain
|
||||||
@@ -227,9 +239,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
/// If the peer has active batches, those are considered failed and re-requested.
|
/// If the peer has active batches, those are considered failed and re-requested.
|
||||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||||
pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult {
|
pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult {
|
||||||
self.peers.remove(peer_id);
|
self.peers.write().remove(peer_id);
|
||||||
|
self.requests_per_peer.remove(peer_id);
|
||||||
|
|
||||||
if self.peers.is_empty() {
|
if self.peers.read().is_empty() {
|
||||||
Err(RemoveChain::EmptyPeerPool)
|
Err(RemoveChain::EmptyPeerPool)
|
||||||
} else {
|
} else {
|
||||||
Ok(KeepChain)
|
Ok(KeepChain)
|
||||||
@@ -259,7 +272,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
// Account for one more requests to this peer
|
// Account for one more requests to this peer
|
||||||
// TODO(das): this code assumes that we do a single request per peer per RpcBlock
|
// TODO(das): this code assumes that we do a single request per peer per RpcBlock
|
||||||
for peer in batch_peers.iter_unique_peers() {
|
for peer in batch_peers.iter_unique_peers() {
|
||||||
*self.peers.entry(*peer).or_default() += 1;
|
*self.requests_per_peer.entry(*peer).or_default() += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check if we have this batch
|
// check if we have this batch
|
||||||
@@ -613,7 +626,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
"Batch failed to download. Dropping chain scoring peers"
|
"Batch failed to download. Dropping chain scoring peers"
|
||||||
);
|
);
|
||||||
|
|
||||||
for (peer, _) in self.peers.drain() {
|
for peer in self.peers.write().drain() {
|
||||||
network.report_peer(peer, penalty, "faulty_chain");
|
network.report_peer(peer, penalty, "faulty_chain");
|
||||||
}
|
}
|
||||||
Err(RemoveChain::ChainFailed {
|
Err(RemoveChain::ChainFailed {
|
||||||
@@ -878,7 +891,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
network: &mut SyncNetworkContext<T>,
|
network: &mut SyncNetworkContext<T>,
|
||||||
peer_id: PeerId,
|
peer_id: PeerId,
|
||||||
) -> ProcessingResult {
|
) -> ProcessingResult {
|
||||||
self.peers.insert(peer_id, <_>::default());
|
self.peers.write().insert(peer_id);
|
||||||
|
self.requests_per_peer.insert(peer_id, <_>::default());
|
||||||
self.request_batches(network)
|
self.request_batches(network)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -952,26 +966,15 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
let request = batch.to_blocks_by_range_request();
|
let request = batch.to_blocks_by_range_request();
|
||||||
let failed_peers = batch.failed_block_peers();
|
let failed_peers = batch.failed_block_peers();
|
||||||
|
|
||||||
// TODO(das): we should request only from peers that are part of this SyncingChain.
|
|
||||||
// However, then we hit the NoPeer error frequently which causes the batch to fail and
|
|
||||||
// the SyncingChain to be dropped. We need to handle this case more gracefully.
|
|
||||||
let synced_peers = network
|
|
||||||
.network_globals()
|
|
||||||
.peers
|
|
||||||
.read()
|
|
||||||
.synced_peers()
|
|
||||||
.cloned()
|
|
||||||
.collect::<HashSet<_>>();
|
|
||||||
|
|
||||||
match network.block_components_by_range_request(
|
match network.block_components_by_range_request(
|
||||||
request,
|
request,
|
||||||
RangeRequestId::RangeSync {
|
RangeRequestId::RangeSync {
|
||||||
chain_id: self.id,
|
chain_id: self.id,
|
||||||
batch_id,
|
batch_id,
|
||||||
},
|
},
|
||||||
&synced_peers,
|
self.peers.clone(),
|
||||||
&failed_peers,
|
&failed_peers,
|
||||||
&self.peers,
|
&self.requests_per_peer,
|
||||||
) {
|
) {
|
||||||
Ok(request_id) => {
|
Ok(request_id) => {
|
||||||
// inform the batch about the new request
|
// inform the batch about the new request
|
||||||
|
|||||||
Reference in New Issue
Block a user