Tweak head syncing (#1845)

## Issue Addressed

Fixes head syncing

## Proposed Changes

- Get back to statusing peers after removing chain segments and making the peer manager deal with status according to the Sync status, preventing an old known deadlock
- Also a bug where a chain would get removed if the optimistic batch succeeds being empty

## Additional Info

Tested on Medalla and looking good
This commit is contained in:
divma
2020-11-01 23:37:39 +00:00
parent 304793a6ab
commit 6c0c050fbb
7 changed files with 134 additions and 60 deletions

View File

@@ -19,7 +19,7 @@
//! need to be downloaded.
//!
//! A few interesting notes about finalized chain syncing:
//! - Only one finalized chain can sync at a time.
//! - Only one finalized chain can sync at a time
//! - The finalized chain with the largest peer pool takes priority.
//! - As one finalized chain completes, others are checked to see if we they can be continued,
//! otherwise they are removed.
@@ -39,7 +39,7 @@
//! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially
//! and further batches are requested as current blocks are being processed.
use super::chain::ChainId;
use super::chain::{ChainId, RemoveChain, SyncingChain};
use super::chain_collection::ChainCollection;
use super::sync_type::RangeSyncType;
use crate::beacon_processor::WorkEvent as BeaconWorkEvent;
@@ -49,7 +49,7 @@ use crate::sync::PeerSyncInfo;
use crate::sync::RequestId;
use beacon_chain::{BeaconChain, BeaconChainTypes};
use eth2_libp2p::PeerId;
use slog::{debug, error, trace};
use slog::{crit, debug, error, trace};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::mpsc;
@@ -121,7 +121,8 @@ impl<T: BeaconChainTypes> RangeSync<T> {
.finalized_epoch
.start_slot(T::EthSpec::slots_per_epoch());
// NOTE: A peer that has been re-status'd may now exist in multiple finalized chains.
// NOTE: A peer that has been re-status'd may now exist in multiple finalized chains. This
// is OK since we since only one finalized chain at a time.
// determine which kind of sync to perform and set up the chains
match RangeSyncType::new(&self.beacon_chain, &local_info, &remote_info) {
@@ -208,22 +209,22 @@ impl<T: BeaconChainTypes> RangeSync<T> {
chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block)
}) {
Ok((removed_chain, sync_type)) => {
if let Some(_removed_chain) = removed_chain {
debug!(self.log, "Chain removed after block response"; "sync_type" => ?sync_type, "chain_id" => chain_id);
// update the state of the collection
self.chains.update(
if let Some((removed_chain, remove_reason)) = removed_chain {
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
&mut self.awaiting_head_peers,
&self.beacon_processor_send,
"block response",
);
}
}
Err(_) => {
debug!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
}
}
} else {
debug!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
}
}
@@ -241,17 +242,18 @@ impl<T: BeaconChainTypes> RangeSync<T> {
Ok((None, _sync_type)) => {
// Chain was found and not removed
}
Ok((Some(_removed_chain), sync_type)) => {
debug!(self.log, "Chain removed after processing result"; "chain" => chain_id, "sync_type" => ?sync_type);
self.chains.update(
Ok((Some((removed_chain, remove_reason)), sync_type)) => {
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
&mut self.awaiting_head_peers,
&self.beacon_processor_send,
"batch processing result",
);
}
Err(_) => {
debug!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
}
}
}
@@ -279,14 +281,16 @@ impl<T: BeaconChainTypes> RangeSync<T> {
.chains
.call_all(|chain| chain.remove_peer(peer_id, network))
{
debug!(self.log, "Chain removed after removing peer"; "sync_type" => ?sync_type, "chain" => removed_chain.get_id(), "reason" => ?remove_reason);
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
"peer removed",
);
// update the state of the collection
}
self.chains.update(
network,
&mut self.awaiting_head_peers,
&self.beacon_processor_send,
);
}
/// An RPC error has occurred.
@@ -307,21 +311,45 @@ impl<T: BeaconChainTypes> RangeSync<T> {
}) {
Ok((removed_chain, sync_type)) => {
if let Some((removed_chain, remove_reason)) = removed_chain {
debug!(self.log, "Chain removed on rpc error"; "sync_type" => ?sync_type, "chain" => removed_chain.get_id(), "reason" => ?remove_reason);
// update the state of the collection
self.chains.update(
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
&mut self.awaiting_head_peers,
&self.beacon_processor_send,
"RPC error",
);
}
}
Err(_) => {
debug!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
}
}
} else {
debug!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
}
}
fn on_chain_removed(
&mut self,
chain: SyncingChain<T>,
sync_type: RangeSyncType,
remove_reason: RemoveChain,
network: &mut SyncNetworkContext<T::EthSpec>,
op: &'static str,
) {
if remove_reason.is_critical() {
crit!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
} else {
debug!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
}
network.status_peers(self.beacon_chain.clone(), chain.peers());
// update the state of the collection
self.chains.update(
network,
&mut self.awaiting_head_peers,
&self.beacon_processor_send,
);
}
}