mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-14 10:22:38 +00:00
Sync fixes (#801)
* Randomize peer selection for batch errors * Downgrade attestation logging * Handle range sync errors * Update lock file * Downgrade logs * Decrease batch size for better thread handling * Optimise peer selection in range sync
This commit is contained in:
@@ -36,14 +36,19 @@ impl SyncNetworkContext {
|
||||
chain: Weak<BeaconChain<T>>,
|
||||
peer_id: PeerId,
|
||||
) {
|
||||
trace!(
|
||||
self.log,
|
||||
"Sending Status Request";
|
||||
"method" => "STATUS",
|
||||
"peer" => format!("{:?}", peer_id)
|
||||
);
|
||||
if let Some(chain) = chain.upgrade() {
|
||||
if let Some(status_message) = status_message(&chain) {
|
||||
debug!(
|
||||
self.log,
|
||||
"Sending Status Request";
|
||||
"peer" => format!("{:?}", peer_id),
|
||||
"fork_version" => format!("{:?}", status_message.fork_version),
|
||||
"finalized_root" => format!("{:?}", status_message.finalized_root),
|
||||
"finalized_epoch" => format!("{:?}", status_message.finalized_epoch),
|
||||
"head_root" => format!("{}", status_message.head_root),
|
||||
"head_slot" => format!("{}", status_message.head_slot),
|
||||
);
|
||||
|
||||
let _ = self.send_rpc_request(peer_id, RPCRequest::Status(status_message));
|
||||
}
|
||||
}
|
||||
@@ -124,8 +129,7 @@ impl SyncNetworkContext {
|
||||
self.network_send
|
||||
.try_send(NetworkMessage::RPC(peer_id, rpc_event))
|
||||
.map_err(|_| {
|
||||
// This is likely to happen when shutting down. Suppress this warning to trace for now
|
||||
trace!(
|
||||
debug!(
|
||||
self.log,
|
||||
"Could not send RPC message to the network service"
|
||||
);
|
||||
|
||||
107
beacon_node/network/src/sync/range_sync/batch.rs
Normal file
107
beacon_node/network/src/sync/range_sync/batch.rs
Normal file
@@ -0,0 +1,107 @@
|
||||
use eth2_libp2p::rpc::RequestId;
|
||||
use eth2_libp2p::PeerId;
|
||||
use fnv::FnvHashMap;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use types::{BeaconBlock, EthSpec, Hash256, Slot};
|
||||
|
||||
/// A collection of sequential blocks that are requested from peers in a single RPC request.
|
||||
#[derive(PartialEq)]
|
||||
pub struct Batch<T: EthSpec> {
|
||||
/// The ID of the batch, these are sequential.
|
||||
pub id: u64,
|
||||
/// The requested start slot of the batch, inclusive.
|
||||
pub start_slot: Slot,
|
||||
/// The requested end slot of batch, exclusive.
|
||||
pub end_slot: Slot,
|
||||
/// The hash of the chain root to requested from the peer.
|
||||
pub head_root: Hash256,
|
||||
/// The peer that was originally assigned to the batch.
|
||||
pub _original_peer: PeerId,
|
||||
/// The peer that is currently assigned to the batch.
|
||||
pub current_peer: PeerId,
|
||||
/// The number of retries this batch has undergone.
|
||||
pub retries: u8,
|
||||
/// The blocks that have been downloaded.
|
||||
pub downloaded_blocks: Vec<BeaconBlock<T>>,
|
||||
}
|
||||
|
||||
impl<T: EthSpec> Ord for Batch<T> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.id.cmp(&other.id)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: EthSpec> PartialOrd for Batch<T> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// A structure that contains a mapping of pending batch requests, that also keeps track of which
|
||||
/// peers are currently making batch requests.
|
||||
///
|
||||
/// This is used to optimise searches for idle peers (peers that have no outbound batch requests).
|
||||
pub struct PendingBatches<T: EthSpec> {
|
||||
/// The current pending batches.
|
||||
batches: FnvHashMap<RequestId, Batch<T>>,
|
||||
/// A mapping of peers to the number of pending requests.
|
||||
peer_requests: HashMap<PeerId, HashSet<RequestId>>,
|
||||
}
|
||||
|
||||
impl<T: EthSpec> PendingBatches<T> {
|
||||
pub fn new() -> Self {
|
||||
PendingBatches {
|
||||
batches: FnvHashMap::default(),
|
||||
peer_requests: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, request_id: RequestId, batch: Batch<T>) -> Option<Batch<T>> {
|
||||
let peer_request = batch.current_peer.clone();
|
||||
self.peer_requests
|
||||
.entry(peer_request)
|
||||
.or_insert_with(|| HashSet::new())
|
||||
.insert(request_id);
|
||||
self.batches.insert(request_id, batch)
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, request_id: &RequestId) -> Option<Batch<T>> {
|
||||
if let Some(batch) = self.batches.remove(request_id) {
|
||||
if let Entry::Occupied(mut entry) = self.peer_requests.entry(batch.current_peer.clone())
|
||||
{
|
||||
entry.get_mut().remove(request_id);
|
||||
|
||||
if entry.get().is_empty() {
|
||||
entry.remove();
|
||||
}
|
||||
}
|
||||
Some(batch)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Adds a block to the batches if the request id exists. Returns None if there is no batch
|
||||
/// matching the request id.
|
||||
pub fn add_block(&mut self, request_id: &RequestId, block: BeaconBlock<T>) -> Option<()> {
|
||||
let batch = self.batches.get_mut(request_id)?;
|
||||
batch.downloaded_blocks.push(block);
|
||||
Some(())
|
||||
}
|
||||
|
||||
/// Returns true if there the peer does not exist in the peer_requests mapping. Indicating it
|
||||
/// has no pending outgoing requests.
|
||||
pub fn peer_is_idle(&self, peer_id: &PeerId) -> bool {
|
||||
self.peer_requests.get(peer_id).is_none()
|
||||
}
|
||||
|
||||
/// Removes a batch for a given peer.
|
||||
pub fn remove_batch_by_peer(&mut self, peer_id: &PeerId) -> Option<Batch<T>> {
|
||||
let request_ids = self.peer_requests.get(peer_id)?;
|
||||
|
||||
let request_id = request_ids.iter().next()?.clone();
|
||||
self.remove(&request_id)
|
||||
}
|
||||
}
|
||||
@@ -1,12 +1,11 @@
|
||||
use crate::message_processor::FUTURE_SLOT_TOLERANCE;
|
||||
use crate::sync::network_context::SyncNetworkContext;
|
||||
use crate::sync::range_sync::batch::{Batch, PendingBatches};
|
||||
use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessingOutcome};
|
||||
use eth2_libp2p::rpc::methods::*;
|
||||
use eth2_libp2p::rpc::RequestId;
|
||||
use eth2_libp2p::PeerId;
|
||||
use fnv::FnvHashMap;
|
||||
use slog::{crit, debug, error, trace, warn, Logger};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::HashSet;
|
||||
use std::ops::Sub;
|
||||
use std::sync::Weak;
|
||||
@@ -18,44 +17,12 @@ use types::{BeaconBlock, EthSpec, Hash256, Slot};
|
||||
/// responder will fill the response up to the max request size, assuming they have the bandwidth
|
||||
/// to do so.
|
||||
//TODO: Make this dynamic based on peer's bandwidth
|
||||
const BLOCKS_PER_BATCH: u64 = 50;
|
||||
//TODO: This is lower due to current thread design. Modify once rebuilt.
|
||||
const BLOCKS_PER_BATCH: u64 = 25;
|
||||
|
||||
/// The number of times to retry a batch before the chain is considered failed and removed.
|
||||
const MAX_BATCH_RETRIES: u8 = 5;
|
||||
|
||||
/// A collection of sequential blocks that are requested from peers in a single RPC request.
|
||||
#[derive(PartialEq)]
|
||||
pub struct Batch<T: EthSpec> {
|
||||
/// The ID of the batch, batches are ID's sequentially.
|
||||
id: u64,
|
||||
/// The requested start slot of the batch, inclusive.
|
||||
start_slot: Slot,
|
||||
/// The requested end slot of batch, exclusive.
|
||||
end_slot: Slot,
|
||||
/// The hash of the chain root to requested from the peer.
|
||||
head_root: Hash256,
|
||||
/// The peer that was originally assigned to the batch.
|
||||
_original_peer: PeerId,
|
||||
/// The peer that is currently assigned to the batch.
|
||||
pub current_peer: PeerId,
|
||||
/// The number of retries this batch has undergone.
|
||||
retries: u8,
|
||||
/// The blocks that have been downloaded.
|
||||
downloaded_blocks: Vec<BeaconBlock<T>>,
|
||||
}
|
||||
|
||||
impl<T: EthSpec> Ord for Batch<T> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.id.cmp(&other.id)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: EthSpec> PartialOrd for Batch<T> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// A return type for functions that act on a `Chain` which informs the caller whether the chain
|
||||
/// has been completed and should be removed or to be kept if further processing is
|
||||
/// required.
|
||||
@@ -105,7 +72,7 @@ pub struct SyncingChain<T: BeaconChainTypes> {
|
||||
|
||||
/// The batches that are currently awaiting a response from a peer. An RPC request for these
|
||||
/// have been sent.
|
||||
pub pending_batches: FnvHashMap<RequestId, Batch<T::EthSpec>>,
|
||||
pub pending_batches: PendingBatches<T::EthSpec>,
|
||||
|
||||
/// The batches that have been downloaded and are awaiting processing and/or validation.
|
||||
completed_batches: Vec<Batch<T::EthSpec>>,
|
||||
@@ -151,7 +118,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
start_slot,
|
||||
target_head_slot,
|
||||
target_head_root,
|
||||
pending_batches: FnvHashMap::default(),
|
||||
pending_batches: PendingBatches::new(),
|
||||
completed_batches: Vec::new(),
|
||||
peer_pool,
|
||||
to_be_downloaded_id: 1,
|
||||
@@ -176,9 +143,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
log: &slog::Logger,
|
||||
) -> Option<ProcessingResult> {
|
||||
if let Some(block) = beacon_block {
|
||||
let batch = self.pending_batches.get_mut(&request_id)?;
|
||||
// This is not a stream termination, simply add the block to the request
|
||||
batch.downloaded_blocks.push(block.clone());
|
||||
self.pending_batches.add_block(&request_id, block.clone())?;
|
||||
return Some(ProcessingResult::KeepChain);
|
||||
} else {
|
||||
// A stream termination has been sent. This batch has ended. Process a completed batch.
|
||||
@@ -202,10 +168,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// blocks for the peer.
|
||||
debug!(log, "Completed batch received"; "id"=>batch.id, "blocks"=>batch.downloaded_blocks.len(), "awaiting_batches" => self.completed_batches.len());
|
||||
|
||||
// The peer that completed this batch, may be re-requested if this batch doesn't complete
|
||||
// the chain and there is no error in processing
|
||||
let current_peer = batch.current_peer.clone();
|
||||
|
||||
// verify the range of received blocks
|
||||
// Note that the order of blocks is verified in block processing
|
||||
if let Some(last_slot) = batch.downloaded_blocks.last().map(|b| b.slot) {
|
||||
@@ -244,7 +206,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
if self.state == ChainSyncingState::Syncing {
|
||||
// pre-emptively request more blocks from peers whilst we process current blocks,
|
||||
self.send_range_request(network, current_peer, log);
|
||||
if !self.send_range_request(network, log) {
|
||||
debug!(log, "No peer available for next batch.")
|
||||
}
|
||||
}
|
||||
|
||||
// Try and process batches sequentially in the ordered list.
|
||||
@@ -424,38 +388,22 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
.retain(|batch| batch.id >= last_processed_id.saturating_sub(1));
|
||||
}
|
||||
|
||||
// Now begin requesting blocks from the peer pool. Ignore any peers with currently
|
||||
// pending requests
|
||||
let pending_peers = self
|
||||
.pending_batches
|
||||
.values()
|
||||
.map(|batch| batch.current_peer.clone())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let peers = self
|
||||
.peer_pool
|
||||
.iter()
|
||||
.filter(|peer| !pending_peers.contains(peer))
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for peer_id in peers {
|
||||
// send a blocks by range request to the peer
|
||||
self.send_range_request(network, peer_id, log);
|
||||
}
|
||||
// Now begin requesting blocks from the peer pool, until all peers are exhausted.
|
||||
while self.send_range_request(network, log) {}
|
||||
|
||||
self.state = ChainSyncingState::Syncing;
|
||||
}
|
||||
|
||||
/// A peer has been added.
|
||||
/// Add a peer to the chain.
|
||||
///
|
||||
/// If the chain is active, this starts requesting batches from this peer.
|
||||
pub fn peer_added(
|
||||
pub fn add_peer(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext,
|
||||
peer_id: PeerId,
|
||||
log: &slog::Logger,
|
||||
) {
|
||||
self.peer_pool.insert(peer_id.clone());
|
||||
// do not request blocks if the chain is not syncing
|
||||
if let ChainSyncingState::Stopped = self.state {
|
||||
debug!(log, "Peer added to a non-syncing chain"; "peer_id" => format!("{:?}", peer_id));
|
||||
@@ -463,7 +411,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
// find the next batch and request it from the peer
|
||||
self.send_range_request(network, peer_id, log);
|
||||
self.send_range_request(network, log);
|
||||
}
|
||||
|
||||
/// Sends a STATUS message to all peers in the peer pool.
|
||||
@@ -473,19 +421,31 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Requests the next required batch from the provided peer.
|
||||
fn send_range_request(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext,
|
||||
peer_id: PeerId,
|
||||
log: &slog::Logger,
|
||||
) {
|
||||
/// Requests the next required batch from a peer. Returns true, if there was a peer available
|
||||
/// to send a request and there are batches to request, false otherwise.
|
||||
fn send_range_request(&mut self, network: &mut SyncNetworkContext, log: &slog::Logger) -> bool {
|
||||
// find the next pending batch and request it from the peer
|
||||
if let Some(batch) = self.get_next_batch(peer_id) {
|
||||
debug!(log, "Requesting batch"; "start_slot" => batch.start_slot, "end_slot" => batch.end_slot, "id" => batch.id, "peer" => format!("{:?}", batch.current_peer), "head_root"=> format!("{}", batch.head_root));
|
||||
// send the batch
|
||||
self.send_batch(network, batch);
|
||||
if let Some(peer_id) = self.get_next_peer() {
|
||||
if let Some(batch) = self.get_next_batch(peer_id) {
|
||||
debug!(log, "Requesting batch"; "start_slot" => batch.start_slot, "end_slot" => batch.end_slot, "id" => batch.id, "peer" => format!("{:?}", batch.current_peer), "head_root"=> format!("{}", batch.head_root));
|
||||
// send the batch
|
||||
self.send_batch(network, batch);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Returns a peer if there exists a peer which does not currently have a pending request.
|
||||
///
|
||||
/// This is used to create the next request.
|
||||
fn get_next_peer(&self) -> Option<PeerId> {
|
||||
for peer in self.peer_pool.iter() {
|
||||
if self.pending_batches.peer_is_idle(peer) {
|
||||
return Some(peer.clone());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Requests the provided batch from the provided peer.
|
||||
@@ -546,7 +506,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
if let Some(batch) = self.pending_batches.remove(&request_id) {
|
||||
warn!(log, "Batch failed. RPC Error"; "id" => batch.id, "retries" => batch.retries, "peer" => format!("{:?}", peer_id));
|
||||
|
||||
Some(self.failed_batch(network, batch))
|
||||
Some(self.failed_batch(network, batch, log))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -561,6 +521,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext,
|
||||
mut batch: Batch<T::EthSpec>,
|
||||
log: &Logger,
|
||||
) -> ProcessingResult {
|
||||
batch.retries += 1;
|
||||
|
||||
@@ -580,6 +541,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
.unwrap_or_else(|| current_peer);
|
||||
|
||||
batch.current_peer = new_peer.clone();
|
||||
debug!(log, "Re-Requesting batch"; "start_slot" => batch.start_slot, "end_slot" => batch.end_slot, "id" => batch.id, "peer" => format!("{:?}", batch.current_peer), "head_root"=> format!("{}", batch.head_root));
|
||||
self.send_batch(network, batch);
|
||||
ProcessingResult::KeepChain
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! This provides the logic for syncing a chain when the local node is far behind it's current
|
||||
//! peers.
|
||||
|
||||
mod batch;
|
||||
mod chain;
|
||||
mod chain_collection;
|
||||
mod range;
|
||||
|
||||
@@ -154,8 +154,7 @@ impl<T: BeaconChainTypes> RangeSync<T> {
|
||||
debug!(self.log, "Finalized chain exists, adding peer"; "peer_id" => format!("{:?}", peer_id), "target_root" => format!("{}", chain.target_head_root), "end_slot" => chain.target_head_slot, "start_slot"=> chain.start_slot);
|
||||
|
||||
// add the peer to the chain's peer pool
|
||||
chain.peer_pool.insert(peer_id.clone());
|
||||
chain.peer_added(network, peer_id, &self.log);
|
||||
chain.add_peer(network, peer_id, &self.log);
|
||||
|
||||
// check if the new peer's addition will favour a new syncing chain.
|
||||
self.chains.update_finalized(network, &self.log);
|
||||
@@ -189,8 +188,7 @@ impl<T: BeaconChainTypes> RangeSync<T> {
|
||||
debug!(self.log, "Adding peer to the existing head chain peer pool"; "head_root" => format!("{}",remote.head_root), "head_slot" => remote.head_slot, "peer_id" => format!("{:?}", peer_id));
|
||||
|
||||
// add the peer to the head's pool
|
||||
chain.peer_pool.insert(peer_id.clone());
|
||||
chain.peer_added(network, peer_id.clone(), &self.log);
|
||||
chain.add_peer(network, peer_id.clone(), &self.log);
|
||||
} else {
|
||||
// There are no other head chains that match this peer's status, create a new one, and
|
||||
let start_slot = std::cmp::min(local_info.head_slot, remote_finalized_slot);
|
||||
@@ -306,22 +304,16 @@ impl<T: BeaconChainTypes> RangeSync<T> {
|
||||
/// for this peer. If so we mark the batch as failed. The batch may then hit it's maximum
|
||||
/// retries. In this case, we need to remove the chain and re-status all the peers.
|
||||
fn remove_peer(&mut self, network: &mut SyncNetworkContext, peer_id: &PeerId) {
|
||||
let log_ref = &self.log;
|
||||
match self.chains.head_finalized_request(|chain| {
|
||||
if chain.peer_pool.remove(&peer_id) {
|
||||
if chain.peer_pool.remove(peer_id) {
|
||||
// this chain contained the peer
|
||||
let pending_batches_requests = chain
|
||||
.pending_batches
|
||||
.iter()
|
||||
.filter(|(_, batch)| batch.current_peer == *peer_id)
|
||||
.map(|(id, _)| id)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
for request_id in pending_batches_requests {
|
||||
if let Some(batch) = chain.pending_batches.remove(&request_id) {
|
||||
if let ProcessingResult::RemoveChain = chain.failed_batch(network, batch) {
|
||||
// a single batch failed, remove the chain
|
||||
return Some(ProcessingResult::RemoveChain);
|
||||
}
|
||||
while let Some(batch) = chain.pending_batches.remove_batch_by_peer(peer_id) {
|
||||
if let ProcessingResult::RemoveChain =
|
||||
chain.failed_batch(network, batch, log_ref)
|
||||
{
|
||||
// a single batch failed, remove the chain
|
||||
return Some(ProcessingResult::RemoveChain);
|
||||
}
|
||||
}
|
||||
// peer removed from chain, no batch failed
|
||||
|
||||
Reference in New Issue
Block a user