Implement checkpoint sync (#2244)

## Issue Addressed

Closes #1891
Closes #1784

## Proposed Changes

Implement checkpoint sync for Lighthouse, enabling it to start from a weak subjectivity checkpoint.

## Additional Info

- [x] Return unavailable status for out-of-range blocks requested by peers (#2561)
- [x] Implement sync daemon for fetching historical blocks (#2561)
- [x] Verify chain hashes (either in `historical_blocks.rs` or the calling module)
- [x] Consistency check for initial block + state
- [x] Fetch the initial state and block from a beacon node HTTP endpoint
- [x] Don't crash fetching beacon states by slot from the API
- [x] Background service for state reconstruction, triggered by CLI flag or API call.

Considered out of scope for this PR:

- Drop the requirement to provide the `--checkpoint-block` (this would require some pretty heavy refactoring of block verification)


Co-authored-by: Diva M <divma@protonmail.com>
This commit is contained in:
Michael Sproul
2021-09-22 00:37:28 +00:00
parent 280e4fe23d
commit 9667dc2f03
71 changed files with 4012 additions and 459 deletions

View File

@@ -2,7 +2,7 @@ use crate::beacon_processor::worker::FUTURE_SLOT_TOLERANCE;
use crate::service::NetworkMessage;
use crate::status::ToStatusMessage;
use crate::sync::SyncMessage;
use beacon_chain::{BeaconChainError, BeaconChainTypes, WhenSlotSkipped};
use beacon_chain::{BeaconChainError, BeaconChainTypes, HistoricalBlockError, WhenSlotSkipped};
use eth2_libp2p::rpc::StatusMessage;
use eth2_libp2p::rpc::*;
use eth2_libp2p::{PeerId, PeerRequestId, ReportSource, Response, SyncInfo};
@@ -38,6 +38,21 @@ impl<T: BeaconChainTypes> Worker<T> {
})
}
/// Sends an error response for the RPC request identified by `id` to the peer `peer_id`.
///
/// The arguments are wrapped in a `NetworkMessage::SendErrorResponse` and handed to
/// `send_network_message`.
pub fn send_error_response(
    &self,
    peer_id: PeerId,
    error: RPCResponseErrorCode,
    reason: String,
    id: PeerRequestId,
) {
    let error_response = NetworkMessage::SendErrorResponse {
        peer_id,
        error,
        reason,
        id,
    };
    self.send_network_message(error_response)
}
/* Processing functions */
/// Process a `Status` message to determine if a peer is relevant to us. If the peer is
@@ -163,6 +178,20 @@ impl<T: BeaconChainTypes> Worker<T> {
.forwards_iter_block_roots(Slot::from(req.start_slot))
{
Ok(iter) => iter,
Err(BeaconChainError::HistoricalBlockError(
HistoricalBlockError::BlockOutOfRange {
slot,
oldest_block_slot,
},
)) => {
debug!(self.log, "Range request failed during backfill"; "requested_slot" => slot, "oldest_known_slot" => oldest_block_slot);
return self.send_error_response(
peer_id,
RPCResponseErrorCode::ResourceUnavailable,
"Backfilling".into(),
request_id,
);
}
Err(e) => return error!(self.log, "Unable to obtain root iter"; "error" => ?e),
};

View File

@@ -2,9 +2,11 @@ use super::{super::work_reprocessing_queue::ReprocessQueueMessage, Worker};
use crate::beacon_processor::worker::FUTURE_SLOT_TOLERANCE;
use crate::beacon_processor::BlockResultSender;
use crate::metrics;
use crate::sync::manager::SyncMessage;
use crate::sync::manager::{SyncMessage, SyncRequestType};
use crate::sync::{BatchProcessResult, ChainId};
use beacon_chain::{BeaconChainTypes, BlockError, ChainSegmentResult};
use beacon_chain::{
BeaconChainError, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError,
};
use eth2_libp2p::PeerId;
use slog::{crit, debug, error, info, trace, warn};
use tokio::sync::mpsc;
@@ -15,6 +17,8 @@ use types::{Epoch, Hash256, SignedBeaconBlock};
pub enum ProcessId {
/// Processing Id of a range syncing batch: the id of the chain and the epoch of the batch.
RangeBatchId(ChainId, Epoch),
/// Processing Id of a backfill syncing batch: the epoch of the batch.
BackSyncBatchId(Epoch),
/// Processing Id of the parent lookup of a block: the peer and the chain head of the lookup.
ParentLookup(PeerId, Hash256),
}
@@ -99,11 +103,40 @@ impl<T: BeaconChainTypes> Worker<T> {
}
};
self.send_sync_message(SyncMessage::BatchProcessed {
chain_id,
epoch,
result,
});
let sync_type = SyncRequestType::RangeSync(epoch, chain_id);
self.send_sync_message(SyncMessage::BatchProcessed { sync_type, result });
}
// this is a request from the Backfill sync
ProcessId::BackSyncBatchId(epoch) => {
let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64());
let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64());
let sent_blocks = downloaded_blocks.len();
let result = match self.process_backfill_blocks(&downloaded_blocks) {
(_, Ok(_)) => {
debug!(self.log, "Backfill batch processed";
"batch_epoch" => epoch,
"first_block_slot" => start_slot,
"last_block_slot" => end_slot,
"processed_blocks" => sent_blocks,
"service"=> "sync");
BatchProcessResult::Success(sent_blocks > 0)
}
(_, Err(e)) => {
debug!(self.log, "Backfill batch processing failed";
"batch_epoch" => epoch,
"first_block_slot" => start_slot,
"last_block_slot" => end_slot,
"error" => e,
"service" => "sync");
BatchProcessResult::Failed(false)
}
};
let sync_type = SyncRequestType::BackFillSync(epoch);
self.send_sync_message(SyncMessage::BatchProcessed { sync_type, result });
}
// this is a parent lookup request from the sync manager
ProcessId::ParentLookup(peer_id, chain_head) => {
@@ -160,6 +193,80 @@ impl<T: BeaconChainTypes> Worker<T> {
}
}
/// Imports a batch of backfill blocks into the chain via `import_historical_block_batch`.
///
/// On success, returns the number of imported blocks paired with `Ok(())`. On failure, returns
/// `0` paired with a short description of the error. The matching success/failure metric
/// counter is incremented, and every failure is logged before returning.
fn process_backfill_blocks(
    &self,
    blocks: &[SignedBeaconBlock<T::EthSpec>],
) -> (usize, Result<(), String>) {
    // Success path: count it and return straight away.
    let import_error = match self.chain.import_historical_block_batch(blocks) {
        Ok(imported_blocks) => {
            metrics::inc_counter(
                &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_SUCCESS_TOTAL,
            );
            return (imported_blocks, Ok(()));
        }
        Err(import_error) => import_error,
    };

    metrics::inc_counter(&metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_FAILED_TOTAL);

    // Anything that is not a historical block error is reported by its debug representation.
    let historical_error = match import_error {
        BeaconChainError::HistoricalBlockError(historical_error) => historical_error,
        other => {
            warn!(self.log, "Backfill batch processing error"; "error" => ?other);
            return (0, Err(format!("{:?}", other)));
        }
    };

    // Handle the historical block errors specifically.
    let reason = match historical_error {
        HistoricalBlockError::MismatchedBlockRoot {
            block_root,
            expected_block_root,
        } => {
            debug!(
                self.log,
                "Backfill batch processing error";
                "error" => "mismatched_block_root",
                "block_root" => ?block_root,
                "expected_root" => ?expected_block_root
            );
            "mismatched_block_root"
        }
        HistoricalBlockError::InvalidSignature | HistoricalBlockError::SignatureSet(_) => {
            warn!(
                self.log,
                "Backfill batch processing error";
                "error" => ?historical_error
            );
            "invalid_signature"
        }
        HistoricalBlockError::ValidatorPubkeyCacheTimeout => {
            warn!(
                self.log,
                "Backfill batch processing error";
                "error" => "pubkey_cache_timeout"
            );
            "pubkey_cache_timeout"
        }
        HistoricalBlockError::NoAnchorInfo => {
            warn!(self.log, "Backfill not required");
            "no_anchor_info"
        }
        HistoricalBlockError::IndexOutOfBounds
        | HistoricalBlockError::BlockOutOfRange { .. } => {
            error!(
                self.log,
                "Backfill batch processing error";
                "error" => ?historical_error,
            );
            "logic_error"
        }
    };

    (0, Err(String::from(reason)))
}
/// Runs fork-choice on a given chain. This is used during block processing after one successful
/// block import.
fn run_fork_choice(&self) {

View File

@@ -338,10 +338,18 @@ lazy_static! {
"beacon_processor_chain_segment_success_total",
"Total number of chain segments successfully processed."
);
pub static ref BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_SUCCESS_TOTAL: Result<IntCounter> = try_create_int_counter(
"beacon_processor_backfill_chain_segment_success_total",
"Total number of chain segments successfully processed."
);
pub static ref BEACON_PROCESSOR_CHAIN_SEGMENT_FAILED_TOTAL: Result<IntCounter> = try_create_int_counter(
"beacon_processor_chain_segment_failed_total",
"Total number of chain segments that failed processing."
);
pub static ref BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_FAILED_TOTAL: Result<IntCounter> = try_create_int_counter(
"beacon_processor_backfill_chain_segment_failed_total",
"Total number of backfill chain segments that failed processing."
);
// Unaggregated attestations.
pub static ref BEACON_PROCESSOR_UNAGGREGATED_ATTESTATION_QUEUE_TOTAL: Result<IntGauge> = try_create_int_gauge(
"beacon_processor_unaggregated_attestation_queue_total",

View File

@@ -418,7 +418,7 @@ impl<T: EthSpec> HandlerNetworkContext<T> {
error: RPCResponseErrorCode,
reason: String,
) {
self.inform_network(NetworkMessage::SendError {
self.inform_network(NetworkMessage::SendErrorResponse {
peer_id,
error,
id,

View File

@@ -63,10 +63,8 @@ pub enum NetworkMessage<T: EthSpec> {
response: Response<T>,
id: PeerRequestId,
},
/// Respond to a peer's request with an error.
SendError {
// NOTE: Currently this is never used, we just say goodbye without nicely closing the
// stream assigned to the request
/// Sends an error response to an RPC request.
SendErrorResponse {
peer_id: PeerId,
error: RPCResponseErrorCode,
reason: String,
@@ -386,7 +384,7 @@ fn spawn_service<T: BeaconChainTypes>(
NetworkMessage::SendResponse{ peer_id, response, id } => {
service.libp2p.send_response(peer_id, id, response);
}
NetworkMessage::SendError{ peer_id, error, id, reason } => {
NetworkMessage::SendErrorResponse{ peer_id, error, id, reason } => {
service.libp2p.respond_with_error(peer_id, id, error, reason);
}
NetworkMessage::UPnPMappingEstablished { tcp_socket, udp_socket} => {

File diff suppressed because it is too large Load Diff

View File

@@ -33,6 +33,7 @@
//! needs to be searched for (i.e if an attestation references an unknown block) this manager can
//! search for the block and subsequently search for parents if needed.
use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart};
use super::network_context::SyncNetworkContext;
use super::peer_sync_info::{remote_sync_type, PeerSyncType};
use super::range_sync::{ChainId, RangeSync, RangeSyncType, EPOCHS_PER_BATCH};
@@ -77,14 +78,14 @@ pub enum SyncMessage<T: EthSpec> {
/// A useful peer has been discovered.
AddPeer(PeerId, SyncInfo),
/// A `BlocksByRange` response has been received.
/// A [`BlocksByRange`] response has been received.
BlocksByRangeResponse {
peer_id: PeerId,
request_id: RequestId,
beacon_block: Option<Box<SignedBeaconBlock<T>>>,
},
/// A `BlocksByRoot` response has been received.
/// A [`BlocksByRoot`] response has been received.
BlocksByRootResponse {
peer_id: PeerId,
request_id: RequestId,
@@ -106,8 +107,7 @@ pub enum SyncMessage<T: EthSpec> {
/// A batch has been processed by the block processor thread.
BatchProcessed {
chain_id: ChainId,
epoch: Epoch,
sync_type: SyncRequestType,
result: BatchProcessResult,
},
@@ -120,6 +120,15 @@ pub enum SyncMessage<T: EthSpec> {
},
}
/// The type of sync request made, i.e. which sync algorithm a request or batch belongs to.
///
/// The sync manager matches on this to route responses, errors and batch processing results
/// to the range sync or to the backfill sync.
#[derive(Debug, Clone)]
pub enum SyncRequestType {
/// Request was from the backfill sync algorithm. Carries the id (epoch) of the batch.
BackFillSync(Epoch),
/// The request was from a chain in the range sync algorithm. Carries the id (epoch) of the
/// batch followed by the id of the chain.
RangeSync(Epoch, ChainId),
}
/// The result of processing multiple blocks (a chain segment).
#[derive(Debug)]
pub enum BatchProcessResult {
@@ -166,6 +175,9 @@ pub struct SyncManager<T: BeaconChainTypes> {
/// The object handling long-range batch load-balanced syncing.
range_sync: RangeSync<T>,
/// Backfill syncing.
backfill_sync: BackFillSync<T>,
/// A collection of parent block lookups.
parent_queue: SmallVec<[ParentRequests<T::EthSpec>; 3]>,
@@ -227,6 +239,12 @@ pub fn spawn<T: BeaconChainTypes>(
beacon_processor_send.clone(),
log.clone(),
),
backfill_sync: BackFillSync::new(
beacon_chain.clone(),
network_globals.clone(),
beacon_processor_send.clone(),
log.clone(),
),
network: SyncNetworkContext::new(network_send, network_globals.clone(), log.clone()),
chain: beacon_chain,
network_globals,
@@ -576,6 +594,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
}
}
/// Handles RPC errors related to requests that were emitted from the sync manager.
fn inject_error(&mut self, peer_id: PeerId, request_id: RequestId) {
trace!(self.log, "Sync manager received a failed RPC");
// remove any single block lookups
@@ -597,14 +616,16 @@ impl<T: BeaconChainTypes> SyncManager<T> {
return;
}
// otherwise, this is a range sync issue, notify the range sync
self.range_sync
.inject_error(&mut self.network, peer_id, request_id);
self.update_sync_state();
// Otherwise this error matches no known request.
trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
}
fn peer_disconnect(&mut self, peer_id: &PeerId) {
self.range_sync.peer_disconnect(&mut self.network, peer_id);
// Regardless of the outcome, we update the sync status.
let _ = self
.backfill_sync
.peer_disconnected(peer_id, &mut self.network);
self.update_sync_state();
}
@@ -624,12 +645,18 @@ impl<T: BeaconChainTypes> SyncManager<T> {
if let Some(peer_info) = self.network_globals.peers.write().peer_info_mut(peer_id) {
let new_state = sync_type.as_sync_status(remote_sync_info);
let rpr = new_state.as_str();
let was_updated = peer_info.sync_status.update(new_state);
let was_updated = peer_info.sync_status.update(new_state.clone());
if was_updated {
debug!(self.log, "Peer transitioned sync state"; "peer_id" => %peer_id, "new_state" => rpr,
"our_head_slot" => local_sync_info.head_slot, "out_finalized_epoch" => local_sync_info.finalized_epoch,
"their_head_slot" => remote_sync_info.head_slot, "their_finalized_epoch" => remote_sync_info.finalized_epoch,
"is_connected" => peer_info.is_connected());
// A peer has transitioned its sync state. If the new state is "synced" we
// inform the backfill sync that a new synced peer has joined us.
if new_state.is_synced() {
self.backfill_sync.fully_synced_peer_joined();
}
}
peer_info.is_connected()
} else {
@@ -638,7 +665,17 @@ impl<T: BeaconChainTypes> SyncManager<T> {
}
}
/// Updates the global sync state and logs any changes.
/// Updates the global sync state, optionally instigating or pausing a backfill sync as well as
/// logging any changes.
///
/// The logic for which sync should be running is as follows:
/// - If there is a range-sync running (or required) pause any backfill and let range-sync
/// complete.
/// - If there is no current range sync, check for any requirement to backfill and either
/// start/resume a backfill sync if required. The global state will be BackFillSync if a
/// backfill sync is running.
/// - If there is no range sync and no required backfill and we have synced up to the currently
/// known peers, we consider ourselves synced.
fn update_sync_state(&mut self) {
let new_state: SyncState = match self.range_sync.state() {
Err(e) => {
@@ -647,41 +684,75 @@ impl<T: BeaconChainTypes> SyncManager<T> {
}
Ok(state) => match state {
None => {
// no range sync, decide if we are stalled or synced.
// No range sync, so we decide if we are stalled or synced.
// For this we check if there is at least one advanced peer. An advanced peer
// with Idle range is possible since a peer's status is updated periodically.
// If we synced a peer between status messages, most likely the peer has
// advanced and will produce a head chain on re-status. Otherwise it will shift
// to being synced
let head = self.chain.best_slot().unwrap_or_else(|_| Slot::new(0));
let current_slot = self.chain.slot().unwrap_or_else(|_| Slot::new(0));
let mut sync_state = {
let head = self.chain.best_slot().unwrap_or_else(|_| Slot::new(0));
let current_slot = self.chain.slot().unwrap_or_else(|_| Slot::new(0));
let peers = self.network_globals.peers.read();
if current_slot >= head
&& current_slot.sub(head) <= (SLOT_IMPORT_TOLERANCE as u64)
&& head > 0
{
SyncState::Synced
} else if peers.advanced_peers().next().is_some() {
SyncState::SyncTransition
} else if peers.synced_peers().next().is_none() {
SyncState::Stalled
} else {
// There are no peers that require syncing and we have at least one synced
// peer
SyncState::Synced
let peers = self.network_globals.peers.read();
if current_slot >= head
&& current_slot.sub(head) <= (SLOT_IMPORT_TOLERANCE as u64)
&& head > 0
{
SyncState::Synced
} else if peers.advanced_peers().next().is_some() {
SyncState::SyncTransition
} else if peers.synced_peers().next().is_none() {
SyncState::Stalled
} else {
// There are no peers that require syncing and we have at least one synced
// peer
SyncState::Synced
}
};
// If we would otherwise be synced, first check if we need to perform or
// complete a backfill sync.
if matches!(sync_state, SyncState::Synced) {
// Determine if we need to start/resume/restart a backfill sync.
match self.backfill_sync.start(&mut self.network) {
Ok(SyncStart::Syncing {
completed,
remaining,
}) => {
sync_state = SyncState::BackFillSyncing {
completed,
remaining,
};
}
Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if the backfill sync state didn't start.
Err(e) => {
error!(self.log, "Backfill sync failed to start"; "error" => ?e);
}
}
}
// Return the sync state if backfilling is not required.
sync_state
}
Some((RangeSyncType::Finalized, start_slot, target_slot)) => {
// If there is a backfill sync in progress pause it.
self.backfill_sync.pause();
SyncState::SyncingFinalized {
start_slot,
target_slot,
}
}
Some((RangeSyncType::Head, start_slot, target_slot)) => SyncState::SyncingHead {
start_slot,
target_slot,
},
Some((RangeSyncType::Head, start_slot, target_slot)) => {
// If there is a backfill sync in progress pause it.
self.backfill_sync.pause();
SyncState::SyncingHead {
start_slot,
target_slot,
}
}
},
};
@@ -690,7 +761,14 @@ impl<T: BeaconChainTypes> SyncManager<T> {
if !new_state.eq(&old_state) {
info!(self.log, "Sync state updated"; "old_state" => %old_state, "new_state" => %new_state);
// If we have become synced - Subscribe to all the core subnet topics
if new_state.is_synced() {
// We don't need to subscribe if the old state is a state that would have already
// invoked this call.
if new_state.is_synced()
&& !matches!(
old_state,
SyncState::Synced { .. } | SyncState::BackFillSyncing { .. }
)
{
self.network.subscribe_core_topics();
}
}
@@ -828,14 +906,13 @@ impl<T: BeaconChainTypes> SyncManager<T> {
// peer. We don't consider this chain a failure and prevent retries with another
// peer.
"too many failed attempts"
} else {
if !parent_request.downloaded_blocks.is_empty() {
self.failed_chains
.insert(parent_request.downloaded_blocks[0].canonical_root());
} else {
crit!(self.log, "Parent lookup has no blocks");
}
} else if !parent_request.downloaded_blocks.is_empty() {
self.failed_chains
.insert(parent_request.downloaded_blocks[0].canonical_root());
"reached maximum lookup-depth"
} else {
crit!(self.log, "Parent lookup has no blocks");
"no blocks"
};
debug!(self.log, "Parent import failed";
@@ -887,13 +964,44 @@ impl<T: BeaconChainTypes> SyncManager<T> {
request_id,
beacon_block,
} => {
self.range_sync.blocks_by_range_response(
&mut self.network,
peer_id,
request_id,
beacon_block.map(|b| *b),
);
self.update_sync_state();
let beacon_block = beacon_block.map(|b| *b);
// Obtain which sync requested these blocks and divert accordingly.
match self
.network
.blocks_by_range_response(request_id, beacon_block.is_none())
{
Some(SyncRequestType::RangeSync(batch_id, chain_id)) => {
self.range_sync.blocks_by_range_response(
&mut self.network,
peer_id,
chain_id,
batch_id,
request_id,
beacon_block,
);
self.update_sync_state();
}
Some(SyncRequestType::BackFillSync(batch_id)) => {
match self.backfill_sync.on_block_response(
&mut self.network,
batch_id,
&peer_id,
request_id,
beacon_block,
) {
Ok(ProcessResult::SyncCompleted) => self.update_sync_state(),
Ok(ProcessResult::Successful) => {}
Err(_error) => {
// The backfill sync has failed, errors are reported
// within.
self.update_sync_state();
}
}
}
None => {
trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
}
}
}
SyncMessage::BlocksByRootResponse {
peer_id,
@@ -913,21 +1021,63 @@ impl<T: BeaconChainTypes> SyncManager<T> {
self.peer_disconnect(&peer_id);
}
SyncMessage::RPCError(peer_id, request_id) => {
self.inject_error(peer_id, request_id);
}
SyncMessage::BatchProcessed {
chain_id,
epoch,
result,
} => {
self.range_sync.handle_block_process_result(
&mut self.network,
chain_id,
epoch,
result,
);
self.update_sync_state();
// Redirect to a sync mechanism if the error is related to one of their
// requests.
match self.network.blocks_by_range_response(request_id, true) {
Some(SyncRequestType::RangeSync(batch_id, chain_id)) => {
self.range_sync.inject_error(
&mut self.network,
peer_id,
batch_id,
chain_id,
request_id,
);
self.update_sync_state();
}
Some(SyncRequestType::BackFillSync(batch_id)) => {
match self.backfill_sync.inject_error(
&mut self.network,
batch_id,
&peer_id,
request_id,
) {
Ok(_) => {}
Err(_) => self.update_sync_state(),
}
}
None => {
// This is a request not belonging to a sync algorithm.
// Process internally.
self.inject_error(peer_id, request_id);
}
}
}
SyncMessage::BatchProcessed { sync_type, result } => match sync_type {
SyncRequestType::RangeSync(epoch, chain_id) => {
self.range_sync.handle_block_process_result(
&mut self.network,
chain_id,
epoch,
result,
);
self.update_sync_state();
}
SyncRequestType::BackFillSync(epoch) => {
match self.backfill_sync.on_batch_process_result(
&mut self.network,
epoch,
&result,
) {
Ok(ProcessResult::Successful) => {}
Ok(ProcessResult::SyncCompleted) => self.update_sync_state(),
Err(error) => {
error!(self.log, "Backfill sync failed"; "error" => ?error);
// Update the global status
self.update_sync_state();
}
}
}
},
SyncMessage::ParentLookupFailed {
chain_head,
peer_id,

View File

@@ -1,6 +1,7 @@
//! Syncing for lighthouse.
//!
//! Stores the various syncing methods for the beacon chain.
mod backfill_sync;
pub mod manager;
mod network_context;
mod peer_sync_info;

View File

@@ -1,6 +1,7 @@
//! Provides network functionality for the Syncing thread. This fundamentally wraps a network
//! channel and stores a global RPC ID to perform requests.
use super::manager::SyncRequestType;
use super::range_sync::{BatchId, ChainId};
use super::RequestId as SyncRequestId;
use crate::service::NetworkMessage;
@@ -26,8 +27,8 @@ pub struct SyncNetworkContext<T: EthSpec> {
/// A sequential ID for all RPC requests.
request_id: SyncRequestId,
/// BlocksByRange requests made by range syncing chains.
range_requests: FnvHashMap<SyncRequestId, (ChainId, BatchId)>,
/// BlocksByRange requests made by syncing algorithms.
range_requests: FnvHashMap<SyncRequestId, SyncRequestType>,
/// Logger for the `SyncNetworkContext`.
log: slog::Logger,
@@ -81,6 +82,7 @@ impl<T: EthSpec> SyncNetworkContext<T> {
}
}
/// A blocks by range request for the range sync algorithm.
pub fn blocks_by_range_request(
&mut self,
peer_id: PeerId,
@@ -96,15 +98,37 @@ impl<T: EthSpec> SyncNetworkContext<T> {
"peer" => %peer_id,
);
let req_id = self.send_rpc_request(peer_id, Request::BlocksByRange(request))?;
self.range_requests.insert(req_id, (chain_id, batch_id));
self.range_requests
.insert(req_id, SyncRequestType::RangeSync(batch_id, chain_id));
Ok(req_id)
}
/// Sends a `BlocksByRange` request to `peer_id` on behalf of the backfill sync algorithm.
///
/// On success the request id is registered in `range_requests` as a backfill request for
/// `batch_id` and returned. An error from `send_rpc_request` is propagated unchanged.
pub fn backfill_blocks_by_range_request(
    &mut self,
    peer_id: PeerId,
    request: BlocksByRangeRequest,
    batch_id: BatchId,
) -> Result<SyncRequestId, &'static str> {
    trace!(
        self.log,
        "Sending backfill BlocksByRange Request";
        "method" => "BlocksByRange",
        "count" => request.count,
        "peer" => %peer_id,
    );

    let rpc_request = Request::BlocksByRange(request);
    let request_id = self.send_rpc_request(peer_id, rpc_request)?;

    // Remember that this request belongs to the backfill sync.
    let origin = SyncRequestType::BackFillSync(batch_id);
    self.range_requests.insert(request_id, origin);

    Ok(request_id)
}
/// Received a blocks by range response.
pub fn blocks_by_range_response(
&mut self,
request_id: usize,
remove: bool,
) -> Option<(ChainId, BatchId)> {
) -> Option<SyncRequestType> {
// NOTE: we can't guarantee that the request must be registered as it could receive more
// than an error, and be removed after receiving the first one.
// FIXME: https://github.com/sigp/lighthouse/issues/1634
@@ -115,6 +139,7 @@ impl<T: EthSpec> SyncNetworkContext<T> {
}
}
/// Sends a blocks by root request.
pub fn blocks_by_root_request(
&mut self,
peer_id: PeerId,
@@ -130,6 +155,7 @@ impl<T: EthSpec> SyncNetworkContext<T> {
self.send_rpc_request(peer_id, Request::BlocksByRoot(request))
}
/// Terminates the connection with the peer and bans them.
pub fn goodbye_peer(&mut self, peer_id: PeerId, reason: GoodbyeReason) {
self.network_send
.send(NetworkMessage::GoodbyePeer {
@@ -142,6 +168,7 @@ impl<T: EthSpec> SyncNetworkContext<T> {
});
}
/// Reports to the scoring algorithm the behaviour of a peer.
pub fn report_peer(&mut self, peer_id: PeerId, action: PeerAction) {
debug!(self.log, "Sync reporting peer"; "peer_id" => %peer_id, "action" => %action);
self.network_send
@@ -155,7 +182,8 @@ impl<T: EthSpec> SyncNetworkContext<T> {
});
}
pub fn send_rpc_request(
/// Sends an RPC request.
fn send_rpc_request(
&mut self,
peer_id: PeerId,
request: Request,
@@ -170,6 +198,7 @@ impl<T: EthSpec> SyncNetworkContext<T> {
Ok(request_id)
}
/// Subscribes to core topics.
pub fn subscribe_core_topics(&mut self) {
self.network_send
.send(NetworkMessage::SubscribeCoreTopics)
@@ -178,6 +207,7 @@ impl<T: EthSpec> SyncNetworkContext<T> {
});
}
/// Sends an arbitrary network message.
fn send_network_msg(&mut self, msg: NetworkMessage<T>) -> Result<(), &'static str> {
self.network_send.send(msg).map_err(|_| {
debug!(self.log, "Could not send message to the network service");

View File

@@ -14,15 +14,34 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5;
/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty.
const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3;
/// Allows customisation of the above constants (`MAX_BATCH_DOWNLOAD_ATTEMPTS` and
/// `MAX_BATCH_PROCESSING_ATTEMPTS`) so that other sync methods such as BackFillSync can use
/// their own limits.
pub trait BatchConfig {
/// The maximum batch download attempts. A batch whose failed download attempts reach this
/// number is marked as failed.
fn max_batch_download_attempts() -> u8;
/// The max batch processing attempts. A batch whose failed processing attempts reach this
/// number is marked as failed.
fn max_batch_processing_attempts() -> u8;
}
/// The `BatchConfig` used by range sync. It is the default configuration of `BatchInfo` and
/// returns the constants defined in this module.
pub struct RangeSyncBatchConfig {}
impl BatchConfig for RangeSyncBatchConfig {
// Range sync keeps using `MAX_BATCH_DOWNLOAD_ATTEMPTS`.
fn max_batch_download_attempts() -> u8 {
MAX_BATCH_DOWNLOAD_ATTEMPTS
}
// Range sync keeps using `MAX_BATCH_PROCESSING_ATTEMPTS`.
fn max_batch_processing_attempts() -> u8 {
MAX_BATCH_PROCESSING_ATTEMPTS
}
}
/// Error type of a batch in a wrong state.
// Such errors should never be encountered.
pub struct WrongState(pub(super) String);
pub struct WrongState(pub(crate) String);
/// Auxiliary type alias for readability.
type IsFailed = bool;
/// A segment of a chain.
pub struct BatchInfo<T: EthSpec> {
pub struct BatchInfo<T: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
/// Start slot of the batch.
start_slot: Slot,
/// End slot of the batch.
@@ -33,6 +52,8 @@ pub struct BatchInfo<T: EthSpec> {
failed_download_attempts: Vec<PeerId>,
/// State of the batch.
state: BatchState<T>,
/// Pin the generic
marker: std::marker::PhantomData<B>,
}
/// Current state of a batch
@@ -73,7 +94,7 @@ impl<T: EthSpec> BatchState<T> {
}
}
impl<T: EthSpec> BatchInfo<T> {
impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
/// Batches are downloaded excluding the first block of the epoch assuming it has already been
/// downloaded.
///
@@ -91,6 +112,7 @@ impl<T: EthSpec> BatchInfo<T> {
failed_processing_attempts: Vec::new(),
failed_download_attempts: Vec::new(),
state: BatchState::AwaitingDownload,
marker: std::marker::PhantomData,
}
}
@@ -120,6 +142,7 @@ impl<T: EthSpec> BatchInfo<T> {
false
}
/// Returns the peer that is currently responsible for progressing the state of the batch.
pub fn current_peer(&self) -> Option<&PeerId> {
match &self.state {
BatchState::AwaitingDownload | BatchState::Failed => None,
@@ -131,6 +154,7 @@ impl<T: EthSpec> BatchInfo<T> {
}
}
/// Returns a BlocksByRange request associated with the batch.
pub fn to_blocks_by_range_request(&self) -> BlocksByRangeRequest {
BlocksByRangeRequest {
start_slot: self.start_slot.into(),
@@ -192,7 +216,7 @@ impl<T: EthSpec> BatchInfo<T> {
// can be tried again
self.failed_download_attempts.push(peer);
self.state = if self.failed_download_attempts.len()
>= MAX_BATCH_DOWNLOAD_ATTEMPTS as usize
>= B::max_batch_download_attempts() as usize
{
BatchState::Failed
} else {
@@ -219,14 +243,21 @@ impl<T: EthSpec> BatchInfo<T> {
}
}
/// Mark the batch as failed and return whether we can attempt a re-download.
///
/// This can happen if a peer disconnects or some error occurred that was not the peer's fault.
/// The `mark_failed` parameter, when set to false, does not increment the failed attempts of
/// this batch or register the peer; it only attempts a re-download.
#[must_use = "Batch may have failed"]
pub fn download_failed(&mut self) -> Result<IsFailed, WrongState> {
pub fn download_failed(&mut self, mark_failed: bool) -> Result<IsFailed, WrongState> {
match self.state.poison() {
BatchState::Downloading(peer, _, _request_id) => {
// register the attempt and check if the batch can be tried again
self.failed_download_attempts.push(peer);
if mark_failed {
self.failed_download_attempts.push(peer);
}
self.state = if self.failed_download_attempts.len()
>= MAX_BATCH_DOWNLOAD_ATTEMPTS as usize
>= B::max_batch_download_attempts() as usize
{
BatchState::Failed
} else {
@@ -294,7 +325,7 @@ impl<T: EthSpec> BatchInfo<T> {
// check if the batch can be downloaded again
if self.failed_processing_attempts.len()
>= MAX_BATCH_PROCESSING_ATTEMPTS as usize
>= B::max_batch_processing_attempts() as usize
{
BatchState::Failed
} else {
@@ -324,7 +355,7 @@ impl<T: EthSpec> BatchInfo<T> {
// check if the batch can be downloaded again
self.state = if self.failed_processing_attempts.len()
>= MAX_BATCH_PROCESSING_ATTEMPTS as usize
>= B::max_batch_processing_attempts() as usize
{
BatchState::Failed
} else {
@@ -365,7 +396,7 @@ impl Attempt {
}
}
impl<T: EthSpec> slog::KV for &mut BatchInfo<T> {
impl<T: EthSpec, B: BatchConfig> slog::KV for &mut BatchInfo<T, B> {
fn serialize(
&self,
record: &slog::Record,
@@ -375,7 +406,7 @@ impl<T: EthSpec> slog::KV for &mut BatchInfo<T> {
}
}
impl<T: EthSpec> slog::KV for BatchInfo<T> {
impl<T: EthSpec, B: BatchConfig> slog::KV for BatchInfo<T, B> {
fn serialize(
&self,
record: &slog::Record,

View File

@@ -181,7 +181,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// fail the batches
for id in batch_ids {
if let Some(batch) = self.batches.get_mut(&id) {
if batch.download_failed()? {
if batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(id));
}
self.retry_batch_download(network, id)?;
@@ -273,7 +273,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
}
/// Sends to process the batch with the given id.
/// Processes the batch with the given id.
/// The batch must exist and be ready for processing
fn process_batch(
&mut self,
@@ -794,7 +794,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
if let Some(active_requests) = self.peers.get_mut(peer_id) {
active_requests.remove(&batch_id);
}
if batch.download_failed()? {
if batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(batch_id));
}
self.retry_batch_download(network, batch_id)
@@ -837,7 +837,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
}
/// Requests the batch asigned to the given id from a given peer.
/// Requests the batch assigned to the given id from a given peer.
pub fn send_batch(
&mut self,
network: &mut SyncNetworkContext<T::EthSpec>,
@@ -883,7 +883,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.peers
.get_mut(&peer)
.map(|request| request.remove(&batch_id));
if batch.download_failed()? {
if batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(batch_id));
} else {
return self.retry_batch_download(network, batch_id);
@@ -990,7 +990,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// this batch could have been included already being an optimistic batch
match self.batches.entry(batch_id) {
Entry::Occupied(_) => {
// this batch doesn't need downlading, let this same function decide the next batch
// this batch doesn't need downloading, let this same function decide the next batch
self.to_be_downloaded += EPOCHS_PER_BATCH;
self.include_next_batch()
}

View File

@@ -7,7 +7,7 @@ mod chain_collection;
mod range;
mod sync_type;
pub use batch::BatchInfo;
pub use batch::{BatchConfig, BatchInfo, BatchState};
pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH};
pub use range::RangeSync;
pub use sync_type::RangeSyncType;

View File

@@ -39,7 +39,7 @@
//! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially
//! and further batches are requested as current blocks are being processed.
use super::chain::{ChainId, RemoveChain, SyncingChain};
use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain};
use super::chain_collection::ChainCollection;
use super::sync_type::RangeSyncType;
use crate::beacon_processor::WorkEvent as BeaconWorkEvent;
@@ -194,34 +194,29 @@ impl<T: BeaconChainTypes> RangeSync<T> {
&mut self,
network: &mut SyncNetworkContext<T::EthSpec>,
peer_id: PeerId,
chain_id: ChainId,
batch_id: BatchId,
request_id: RequestId,
beacon_block: Option<SignedBeaconBlock<T::EthSpec>>,
) {
// get the chain and batch for which this response belongs
if let Some((chain_id, batch_id)) =
network.blocks_by_range_response(request_id, beacon_block.is_none())
{
// check if this chunk removes the chain
match self.chains.call_by_id(chain_id, |chain| {
chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block)
}) {
Ok((removed_chain, sync_type)) => {
if let Some((removed_chain, remove_reason)) = removed_chain {
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
"block response",
);
}
}
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
// check if this chunk removes the chain
match self.chains.call_by_id(chain_id, |chain| {
chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block)
}) {
Ok((removed_chain, sync_type)) => {
if let Some((removed_chain, remove_reason)) = removed_chain {
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
"block response",
);
}
}
} else {
trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
}
}
}
@@ -298,31 +293,28 @@ impl<T: BeaconChainTypes> RangeSync<T> {
&mut self,
network: &mut SyncNetworkContext<T::EthSpec>,
peer_id: PeerId,
batch_id: BatchId,
chain_id: ChainId,
request_id: RequestId,
) {
// get the chain and batch for which this response belongs
if let Some((chain_id, batch_id)) = network.blocks_by_range_response(request_id, true) {
// check that this request is pending
match self.chains.call_by_id(chain_id, |chain| {
chain.inject_error(network, batch_id, &peer_id, request_id)
}) {
Ok((removed_chain, sync_type)) => {
if let Some((removed_chain, remove_reason)) = removed_chain {
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
"RPC error",
);
}
}
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
// check that this request is pending
match self.chains.call_by_id(chain_id, |chain| {
chain.inject_error(network, batch_id, &peer_id, request_id)
}) {
Ok((removed_chain, sync_type)) => {
if let Some((removed_chain, remove_reason)) = removed_chain {
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
"RPC error",
);
}
}
} else {
trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
}
}
}