mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-20 05:14:35 +00:00
Custody backfill sync (#7907)
#7603 #### Custody backfill sync service Similar in many ways to the current backfill service. There may be ways to unify the two services. The difficulty there is that the current backfill service tightly couples blocks and their associated blobs/data columns. Any attempts to unify the two services should be left to a separate PR in my opinion. #### `SyncNetworkContext` `SyncNetworkContext` manages custody sync data columns by range requests separately from other sync RPC requests. I think this is a nice separation considering that custody backfill is its own service. #### Data column import logic The import logic verifies KZG commitments and that the data columns' block root matches the block root in the node's store before importing columns. #### New channel to send messages to `SyncManager` Now external services can communicate with the `SyncManager`. In this PR this channel is used to trigger a custody sync. Alternatively we may be able to use the existing `mpsc` channel that the `SyncNetworkContext` uses to communicate with the `SyncManager`. I will spend some time reviewing this. Co-Authored-By: Eitan Seri-Levi <eserilev@ucsc.edu> Co-Authored-By: Eitan Seri-Levi <eserilev@gmail.com> Co-Authored-By: dapplion <35266934+dapplion@users.noreply.github.com>
This commit is contained in:
@@ -9,24 +9,27 @@
|
||||
//! sync as failed, log an error and attempt to retry once a new peer joins the node.
|
||||
|
||||
use crate::network_beacon_processor::ChainSegmentProcessId;
|
||||
use crate::sync::batch::{
|
||||
BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState,
|
||||
};
|
||||
use crate::sync::block_sidecar_coupling::CouplingError;
|
||||
use crate::sync::manager::BatchProcessResult;
|
||||
use crate::sync::network_context::{
|
||||
RangeRequestId, RpcRequestSendError, RpcResponseError, SyncNetworkContext,
|
||||
};
|
||||
use crate::sync::range_sync::{
|
||||
BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState,
|
||||
};
|
||||
use beacon_chain::block_verification_types::RpcBlock;
|
||||
use beacon_chain::{BeaconChain, BeaconChainTypes};
|
||||
use lighthouse_network::service::api_types::Id;
|
||||
use lighthouse_network::types::{BackFillState, NetworkGlobals};
|
||||
use lighthouse_network::{PeerAction, PeerId};
|
||||
use logging::crit;
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::collections::{
|
||||
HashSet,
|
||||
btree_map::{BTreeMap, Entry},
|
||||
};
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, error, info, warn};
|
||||
use types::{ColumnIndex, Epoch, EthSpec};
|
||||
@@ -49,21 +52,27 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10;
|
||||
/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty.
|
||||
const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 10;
|
||||
|
||||
/// Custom configuration for the batch object.
|
||||
struct BackFillBatchConfig {}
|
||||
type RpcBlocks<E> = Vec<RpcBlock<E>>;
|
||||
|
||||
impl BatchConfig for BackFillBatchConfig {
|
||||
type BackFillBatchInfo<E> = BatchInfo<E, BackFillBatchConfig<E>, RpcBlocks<E>>;
|
||||
|
||||
type BackFillSyncBatches<E> = BTreeMap<BatchId, BackFillBatchInfo<E>>;
|
||||
|
||||
/// Custom configuration for the batch object.
|
||||
struct BackFillBatchConfig<E: EthSpec> {
|
||||
marker: PhantomData<E>,
|
||||
}
|
||||
|
||||
impl<E: EthSpec> BatchConfig for BackFillBatchConfig<E> {
|
||||
fn max_batch_download_attempts() -> u8 {
|
||||
MAX_BATCH_DOWNLOAD_ATTEMPTS
|
||||
}
|
||||
fn max_batch_processing_attempts() -> u8 {
|
||||
MAX_BATCH_PROCESSING_ATTEMPTS
|
||||
}
|
||||
fn batch_attempt_hash<E: EthSpec>(blocks: &[RpcBlock<E>]) -> u64 {
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
fn batch_attempt_hash<D: Hash>(data: &D) -> u64 {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
blocks.hash(&mut hasher);
|
||||
data.hash(&mut hasher);
|
||||
hasher.finish()
|
||||
}
|
||||
}
|
||||
@@ -121,7 +130,7 @@ pub struct BackFillSync<T: BeaconChainTypes> {
|
||||
last_batch_downloaded: bool,
|
||||
|
||||
/// Sorted map of batches undergoing some kind of processing.
|
||||
batches: BTreeMap<BatchId, BatchInfo<T::EthSpec, BackFillBatchConfig>>,
|
||||
batches: BackFillSyncBatches<T::EthSpec>,
|
||||
|
||||
/// The current processing batch, if any.
|
||||
current_processing_batch: Option<BatchId>,
|
||||
@@ -349,7 +358,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// reasons. Check that this block belongs to the expected peer
|
||||
// TODO(das): removed peer_id matching as the node may request a different peer for data
|
||||
// columns.
|
||||
if !batch.is_expecting_block(&request_id) {
|
||||
if !batch.is_expecting_request_id(&request_id) {
|
||||
return Ok(());
|
||||
}
|
||||
debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed");
|
||||
@@ -393,12 +402,13 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// sending an error /timeout) if the peer is removed from the chain for other
|
||||
// reasons. Check that this block belongs to the expected peer, and that the
|
||||
// request_id matches
|
||||
if !batch.is_expecting_block(&request_id) {
|
||||
if !batch.is_expecting_request_id(&request_id) {
|
||||
return Ok(ProcessResult::Successful);
|
||||
}
|
||||
let received = blocks.len();
|
||||
|
||||
match batch.download_completed(blocks, *peer_id) {
|
||||
Ok(received) => {
|
||||
Ok(_) => {
|
||||
let awaiting_batches =
|
||||
self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH;
|
||||
debug!(
|
||||
@@ -1050,7 +1060,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// only request batches up to the buffer size limit
|
||||
// NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync
|
||||
// if the current processing window is contained in a long range of skip slots.
|
||||
let in_buffer = |batch: &BatchInfo<T::EthSpec, BackFillBatchConfig>| {
|
||||
let in_buffer = |batch: &BackFillBatchInfo<T::EthSpec>| {
|
||||
matches!(
|
||||
batch.state(),
|
||||
BatchState::Downloading(..) | BatchState::AwaitingProcessing(..)
|
||||
|
||||
@@ -2,29 +2,28 @@ use beacon_chain::block_verification_types::RpcBlock;
|
||||
use derivative::Derivative;
|
||||
use lighthouse_network::PeerId;
|
||||
use lighthouse_network::rpc::methods::BlocksByRangeRequest;
|
||||
use lighthouse_network::rpc::methods::DataColumnsByRangeRequest;
|
||||
use lighthouse_network::service::api_types::Id;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::hash::Hash;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::Sub;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use strum::Display;
|
||||
use types::{Epoch, EthSpec, Slot};
|
||||
use types::Slot;
|
||||
use types::{DataColumnSidecarList, Epoch, EthSpec};
|
||||
|
||||
/// The number of times to retry a batch before it is considered failed.
|
||||
const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5;
|
||||
|
||||
/// Invalid batches are attempted to be re-downloaded from other peers. If a batch cannot be processed
|
||||
/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty.
|
||||
const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3;
|
||||
pub type BatchId = Epoch;
|
||||
|
||||
/// Type of expected batch.
|
||||
#[derive(Debug, Copy, Clone, Display)]
|
||||
#[derive(Debug, Clone, Display)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum ByRangeRequestType {
|
||||
BlocksAndColumns,
|
||||
BlocksAndBlobs,
|
||||
Blocks,
|
||||
Columns(HashSet<u64>),
|
||||
}
|
||||
|
||||
/// Allows customisation of the above constants used in other sync methods such as BackFillSync.
|
||||
@@ -60,28 +59,10 @@ pub trait BatchConfig {
|
||||
/// Note that simpler hashing functions considered in the past (hash of first block, hash of last
|
||||
/// block, number of received blocks) are not good enough to differentiate attempts. For this
|
||||
/// reason, we hash the complete set of blocks both in RangeSync and BackFillSync.
|
||||
fn batch_attempt_hash<E: EthSpec>(blocks: &[RpcBlock<E>]) -> u64;
|
||||
fn batch_attempt_hash<D: Hash>(data: &D) -> u64;
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RangeSyncBatchConfig {}
|
||||
|
||||
impl BatchConfig for RangeSyncBatchConfig {
|
||||
fn max_batch_download_attempts() -> u8 {
|
||||
MAX_BATCH_DOWNLOAD_ATTEMPTS
|
||||
}
|
||||
fn max_batch_processing_attempts() -> u8 {
|
||||
MAX_BATCH_PROCESSING_ATTEMPTS
|
||||
}
|
||||
fn batch_attempt_hash<E: EthSpec>(blocks: &[RpcBlock<E>]) -> u64 {
|
||||
let mut hasher = std::collections::hash_map::DefaultHasher::new();
|
||||
blocks.hash(&mut hasher);
|
||||
hasher.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Error type of a batch in a wrong state.
|
||||
// Such errors should never be encountered.
|
||||
pub struct WrongState(pub(crate) String);
|
||||
|
||||
/// After batch operations, we use this to communicate whether a batch can continue or not
|
||||
@@ -100,28 +81,30 @@ pub enum BatchProcessingResult {
|
||||
#[derive(Derivative)]
|
||||
#[derivative(Debug)]
|
||||
/// A segment of a chain.
|
||||
pub struct BatchInfo<E: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
|
||||
pub struct BatchInfo<E: EthSpec, B: BatchConfig, D: Hash> {
|
||||
/// Start slot of the batch.
|
||||
start_slot: Slot,
|
||||
/// End slot of the batch.
|
||||
end_slot: Slot,
|
||||
/// The `Attempts` that have been made and failed to send us this batch.
|
||||
failed_processing_attempts: Vec<Attempt>,
|
||||
failed_processing_attempts: Vec<Attempt<D>>,
|
||||
/// Number of processing attempts that have failed but we do not count.
|
||||
non_faulty_processing_attempts: u8,
|
||||
/// The number of download retries this batch has undergone due to a failed request.
|
||||
failed_download_attempts: Vec<Option<PeerId>>,
|
||||
/// State of the batch.
|
||||
state: BatchState<E>,
|
||||
state: BatchState<D>,
|
||||
/// Whether this batch contains all blocks or all blocks and blobs.
|
||||
batch_type: ByRangeRequestType,
|
||||
/// Pin the generic
|
||||
#[derivative(Debug = "ignore")]
|
||||
marker: std::marker::PhantomData<B>,
|
||||
marker: std::marker::PhantomData<(E, B)>,
|
||||
}
|
||||
|
||||
impl<E: EthSpec, B: BatchConfig> fmt::Display for BatchInfo<E, B> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
impl<E: EthSpec, B: BatchConfig, D: std::fmt::Debug + Hash> std::fmt::Display
|
||||
for BatchInfo<E, B, D>
|
||||
{
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"Start Slot: {}, End Slot: {}, State: {}",
|
||||
@@ -132,21 +115,21 @@ impl<E: EthSpec, B: BatchConfig> fmt::Display for BatchInfo<E, B> {
|
||||
|
||||
#[derive(Display)]
|
||||
/// Current state of a batch
|
||||
pub enum BatchState<E: EthSpec> {
|
||||
pub enum BatchState<D: Hash> {
|
||||
/// The batch has failed either downloading or processing, but can be requested again.
|
||||
AwaitingDownload,
|
||||
/// The batch is being downloaded.
|
||||
Downloading(Id),
|
||||
/// The batch has been completely downloaded and is ready for processing.
|
||||
AwaitingProcessing(PeerId, Vec<RpcBlock<E>>, Instant),
|
||||
AwaitingProcessing(PeerId, D, Instant),
|
||||
/// The batch is being processed.
|
||||
Processing(Attempt),
|
||||
Processing(Attempt<D>),
|
||||
/// The batch was successfully processed and is waiting to be validated.
|
||||
///
|
||||
/// It is not sufficient to process a batch successfully to consider it correct. This is
|
||||
/// because batches could be erroneously empty, or incomplete. Therefore, a batch is considered
|
||||
/// valid, only if the next sequential batch imports at least a block.
|
||||
AwaitingValidation(Attempt),
|
||||
AwaitingValidation(Attempt<D>),
|
||||
/// Intermediate state for inner state handling.
|
||||
Poisoned,
|
||||
/// The batch has maxed out the allowed attempts for either downloading or processing. It
|
||||
@@ -154,14 +137,14 @@ pub enum BatchState<E: EthSpec> {
|
||||
Failed,
|
||||
}
|
||||
|
||||
impl<E: EthSpec> BatchState<E> {
|
||||
impl<D: Hash> BatchState<D> {
|
||||
/// Helper function for poisoning a state.
|
||||
pub fn poison(&mut self) -> BatchState<E> {
|
||||
pub fn poison(&mut self) -> BatchState<D> {
|
||||
std::mem::replace(self, BatchState::Poisoned)
|
||||
}
|
||||
}
|
||||
|
||||
impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
|
||||
impl<E: EthSpec, B: BatchConfig, D: Hash> BatchInfo<E, B, D> {
|
||||
/// Batches are downloaded excluding the first block of the epoch assuming it has already been
|
||||
/// downloaded.
|
||||
///
|
||||
@@ -178,13 +161,13 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
|
||||
pub fn new(start_epoch: &Epoch, num_of_epochs: u64, batch_type: ByRangeRequestType) -> Self {
|
||||
let start_slot = start_epoch.start_slot(E::slots_per_epoch());
|
||||
let end_slot = start_slot + num_of_epochs * E::slots_per_epoch();
|
||||
BatchInfo {
|
||||
Self {
|
||||
start_slot,
|
||||
end_slot,
|
||||
failed_processing_attempts: Vec::new(),
|
||||
failed_download_attempts: Vec::new(),
|
||||
non_faulty_processing_attempts: 0,
|
||||
state: BatchState::AwaitingDownload,
|
||||
state: BatchState::<D>::AwaitingDownload,
|
||||
batch_type,
|
||||
marker: std::marker::PhantomData,
|
||||
}
|
||||
@@ -208,8 +191,8 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
|
||||
peers
|
||||
}
|
||||
|
||||
/// Verifies if an incoming block belongs to this batch.
|
||||
pub fn is_expecting_block(&self, request_id: &Id) -> bool {
|
||||
/// Verifies if an incoming request id to this batch.
|
||||
pub fn is_expecting_request_id(&self, request_id: &Id) -> bool {
|
||||
if let BatchState::Downloading(expected_id) = &self.state {
|
||||
return expected_id == request_id;
|
||||
}
|
||||
@@ -227,30 +210,6 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the count of stored pending blocks if in awaiting processing state
|
||||
pub fn pending_blocks(&self) -> usize {
|
||||
match &self.state {
|
||||
BatchState::AwaitingProcessing(_, blocks, _) => blocks.len(),
|
||||
BatchState::AwaitingDownload
|
||||
| BatchState::Downloading { .. }
|
||||
| BatchState::Processing { .. }
|
||||
| BatchState::AwaitingValidation { .. }
|
||||
| BatchState::Poisoned
|
||||
| BatchState::Failed => 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a BlocksByRange request associated with the batch.
|
||||
pub fn to_blocks_by_range_request(&self) -> (BlocksByRangeRequest, ByRangeRequestType) {
|
||||
(
|
||||
BlocksByRangeRequest::new(
|
||||
self.start_slot.into(),
|
||||
self.end_slot.sub(self.start_slot).into(),
|
||||
),
|
||||
self.batch_type,
|
||||
)
|
||||
}
|
||||
|
||||
/// After different operations over a batch, this could be in a state that allows it to
|
||||
/// continue, or in failed state. When the batch has failed, we check if it did mainly due to
|
||||
/// processing failures. In this case the batch is considered failed and faulty.
|
||||
@@ -265,27 +224,22 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn state(&self) -> &BatchState<E> {
|
||||
pub fn state(&self) -> &BatchState<D> {
|
||||
&self.state
|
||||
}
|
||||
|
||||
pub fn attempts(&self) -> &[Attempt] {
|
||||
pub fn attempts(&self) -> &[Attempt<D>] {
|
||||
&self.failed_processing_attempts
|
||||
}
|
||||
|
||||
/// Marks the batch as ready to be processed if the blocks are in the range. The number of
|
||||
/// received blocks is returned, or the wrong batch end on failure
|
||||
/// Marks the batch as ready to be processed if the data columns are in the range. The number of
|
||||
/// received columns is returned, or the wrong batch end on failure
|
||||
#[must_use = "Batch may have failed"]
|
||||
pub fn download_completed(
|
||||
&mut self,
|
||||
blocks: Vec<RpcBlock<E>>,
|
||||
peer: PeerId,
|
||||
) -> Result<usize /* Received blocks */, WrongState> {
|
||||
pub fn download_completed(&mut self, data_columns: D, peer: PeerId) -> Result<(), WrongState> {
|
||||
match self.state.poison() {
|
||||
BatchState::Downloading(_) => {
|
||||
let received = blocks.len();
|
||||
self.state = BatchState::AwaitingProcessing(peer, blocks, Instant::now());
|
||||
Ok(received)
|
||||
self.state = BatchState::AwaitingProcessing(peer, data_columns, Instant::now());
|
||||
Ok(())
|
||||
}
|
||||
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
||||
other => {
|
||||
@@ -376,17 +330,17 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_processing(&mut self) -> Result<(Vec<RpcBlock<E>>, Duration), WrongState> {
|
||||
pub fn start_processing(&mut self) -> Result<(D, Duration), WrongState> {
|
||||
match self.state.poison() {
|
||||
BatchState::AwaitingProcessing(peer, blocks, start_instant) => {
|
||||
self.state = BatchState::Processing(Attempt::new::<B, E>(peer, &blocks));
|
||||
Ok((blocks, start_instant.elapsed()))
|
||||
BatchState::AwaitingProcessing(peer, data_columns, start_instant) => {
|
||||
self.state = BatchState::Processing(Attempt::new::<B>(peer, &data_columns));
|
||||
Ok((data_columns, start_instant.elapsed()))
|
||||
}
|
||||
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
||||
other => {
|
||||
self.state = other;
|
||||
Err(WrongState(format!(
|
||||
"Starting procesing batch in wrong state {:?}",
|
||||
"Starting processing batch in wrong state {:?}",
|
||||
self.state
|
||||
)))
|
||||
}
|
||||
@@ -466,37 +420,86 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents a peer's attempt and providing the result for this batch.
|
||||
///
|
||||
/// Invalid attempts will downscore a peer.
|
||||
#[derive(PartialEq, Debug)]
|
||||
pub struct Attempt {
|
||||
// BatchInfo implementations for RangeSync
|
||||
impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B, Vec<RpcBlock<E>>> {
|
||||
/// Returns a BlocksByRange request associated with the batch.
|
||||
pub fn to_blocks_by_range_request(&self) -> (BlocksByRangeRequest, ByRangeRequestType) {
|
||||
(
|
||||
BlocksByRangeRequest::new(
|
||||
self.start_slot.into(),
|
||||
self.end_slot.sub(self.start_slot).into(),
|
||||
),
|
||||
self.batch_type.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns the count of stored pending blocks if in awaiting processing state
|
||||
pub fn pending_blocks(&self) -> usize {
|
||||
match &self.state {
|
||||
BatchState::AwaitingProcessing(_, blocks, _) => blocks.len(),
|
||||
BatchState::AwaitingDownload
|
||||
| BatchState::Downloading { .. }
|
||||
| BatchState::Processing { .. }
|
||||
| BatchState::AwaitingValidation { .. }
|
||||
| BatchState::Poisoned
|
||||
| BatchState::Failed => 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// BatchInfo implementation for CustodyBackFillSync
|
||||
impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B, DataColumnSidecarList<E>> {
|
||||
/// Returns a DataColumnsByRange request associated with the batch.
|
||||
pub fn to_data_columns_by_range_request(
|
||||
&self,
|
||||
) -> Result<DataColumnsByRangeRequest, WrongState> {
|
||||
match &self.batch_type {
|
||||
ByRangeRequestType::Columns(columns) => Ok(DataColumnsByRangeRequest {
|
||||
start_slot: self.start_slot.into(),
|
||||
count: self.end_slot.sub(self.start_slot).into(),
|
||||
columns: columns.clone().into_iter().collect(),
|
||||
}),
|
||||
_ => Err(WrongState(
|
||||
"Custody backfill sync can only make data columns by range requests.".to_string(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Attempt<D: Hash> {
|
||||
/// The peer that made the attempt.
|
||||
pub peer_id: PeerId,
|
||||
/// The hash of the blocks of the attempt.
|
||||
pub hash: u64,
|
||||
/// Pin the generic.
|
||||
marker: PhantomData<D>,
|
||||
}
|
||||
|
||||
impl Attempt {
|
||||
fn new<B: BatchConfig, E: EthSpec>(peer_id: PeerId, blocks: &[RpcBlock<E>]) -> Self {
|
||||
let hash = B::batch_attempt_hash(blocks);
|
||||
Attempt { peer_id, hash }
|
||||
impl<D: Hash> Attempt<D> {
|
||||
fn new<B: BatchConfig>(peer_id: PeerId, data: &D) -> Self {
|
||||
let hash = B::batch_attempt_hash(data);
|
||||
Attempt {
|
||||
peer_id,
|
||||
hash,
|
||||
marker: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<E: EthSpec> std::fmt::Debug for BatchState<E> {
|
||||
impl<D: Hash> std::fmt::Debug for BatchState<D> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
BatchState::Processing(Attempt { peer_id, hash: _ }) => {
|
||||
BatchState::Processing(Attempt { peer_id, .. }) => {
|
||||
write!(f, "Processing({})", peer_id)
|
||||
}
|
||||
BatchState::AwaitingValidation(Attempt { peer_id, hash: _ }) => {
|
||||
BatchState::AwaitingValidation(Attempt { peer_id, .. }) => {
|
||||
write!(f, "AwaitingValidation({})", peer_id)
|
||||
}
|
||||
BatchState::AwaitingDownload => f.write_str("AwaitingDownload"),
|
||||
BatchState::Failed => f.write_str("Failed"),
|
||||
BatchState::AwaitingProcessing(peer, blocks, _) => {
|
||||
write!(f, "AwaitingProcessing({}, {} blocks)", peer, blocks.len())
|
||||
BatchState::AwaitingProcessing(peer, ..) => {
|
||||
write!(f, "AwaitingProcessing({})", peer)
|
||||
}
|
||||
BatchState::Downloading(request_id) => {
|
||||
write!(f, "Downloading({})", request_id)
|
||||
@@ -506,7 +509,7 @@ impl<E: EthSpec> std::fmt::Debug for BatchState<E> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<E: EthSpec> BatchState<E> {
|
||||
impl<D: Hash> BatchState<D> {
|
||||
/// Creates a character representation/visualization for the batch state to display in logs for quicker and
|
||||
/// easier recognition
|
||||
fn visualize(&self) -> char {
|
||||
@@ -36,7 +36,7 @@ pub struct RangeBlockComponentsRequest<E: EthSpec> {
|
||||
pub(crate) request_span: Span,
|
||||
}
|
||||
|
||||
enum ByRangeRequest<I: PartialEq + std::fmt::Display, T> {
|
||||
pub enum ByRangeRequest<I: PartialEq + std::fmt::Display, T> {
|
||||
Active(I),
|
||||
Complete(T),
|
||||
}
|
||||
@@ -435,7 +435,7 @@ impl<E: EthSpec> RangeBlockComponentsRequest<E> {
|
||||
}
|
||||
|
||||
impl<I: PartialEq + std::fmt::Display, T> ByRangeRequest<I, T> {
|
||||
fn finish(&mut self, id: I, data: T) -> Result<(), String> {
|
||||
pub fn finish(&mut self, id: I, data: T) -> Result<(), String> {
|
||||
match self {
|
||||
Self::Active(expected_id) => {
|
||||
if expected_id != &id {
|
||||
@@ -448,7 +448,7 @@ impl<I: PartialEq + std::fmt::Display, T> ByRangeRequest<I, T> {
|
||||
}
|
||||
}
|
||||
|
||||
fn to_finished(&self) -> Option<&T> {
|
||||
pub fn to_finished(&self) -> Option<&T> {
|
||||
match self {
|
||||
Self::Active(_) => None,
|
||||
Self::Complete(data) => Some(data),
|
||||
@@ -467,7 +467,7 @@ mod tests {
|
||||
PeerId,
|
||||
service::api_types::{
|
||||
BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId,
|
||||
DataColumnsByRangeRequestId, Id, RangeRequestId,
|
||||
DataColumnsByRangeRequestId, DataColumnsByRangeRequester, Id, RangeRequestId,
|
||||
},
|
||||
};
|
||||
use rand::SeedableRng;
|
||||
@@ -501,7 +501,7 @@ mod tests {
|
||||
|
||||
fn columns_id(
|
||||
id: Id,
|
||||
parent_request_id: ComponentsByRangeRequestId,
|
||||
parent_request_id: DataColumnsByRangeRequester,
|
||||
) -> DataColumnsByRangeRequestId {
|
||||
DataColumnsByRangeRequestId {
|
||||
id,
|
||||
@@ -598,7 +598,15 @@ mod tests {
|
||||
let columns_req_id = expects_custody_columns
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, column)| (columns_id(i as Id, components_id), vec![*column]))
|
||||
.map(|(i, column)| {
|
||||
(
|
||||
columns_id(
|
||||
i as Id,
|
||||
DataColumnsByRangeRequester::ComponentsByRange(components_id),
|
||||
),
|
||||
vec![*column],
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let mut info = RangeBlockComponentsRequest::<E>::new(
|
||||
blocks_req_id,
|
||||
@@ -657,7 +665,15 @@ mod tests {
|
||||
let columns_req_id = batched_column_requests
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, columns)| (columns_id(i as Id, components_id), columns.clone()))
|
||||
.map(|(i, columns)| {
|
||||
(
|
||||
columns_id(
|
||||
i as Id,
|
||||
DataColumnsByRangeRequester::ComponentsByRange(components_id),
|
||||
),
|
||||
columns.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut info = RangeBlockComponentsRequest::<E>::new(
|
||||
@@ -738,7 +754,15 @@ mod tests {
|
||||
let columns_req_id = expected_custody_columns
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, column)| (columns_id(i as Id, components_id), vec![*column]))
|
||||
.map(|(i, column)| {
|
||||
(
|
||||
columns_id(
|
||||
i as Id,
|
||||
DataColumnsByRangeRequester::ComponentsByRange(components_id),
|
||||
),
|
||||
vec![*column],
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let mut info = RangeBlockComponentsRequest::<E>::new(
|
||||
blocks_req_id,
|
||||
@@ -816,7 +840,15 @@ mod tests {
|
||||
let columns_req_id = expected_custody_columns
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, column)| (columns_id(i as Id, components_id), vec![*column]))
|
||||
.map(|(i, column)| {
|
||||
(
|
||||
columns_id(
|
||||
i as Id,
|
||||
DataColumnsByRangeRequester::ComponentsByRange(components_id),
|
||||
),
|
||||
vec![*column],
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let mut info = RangeBlockComponentsRequest::<E>::new(
|
||||
blocks_req_id,
|
||||
@@ -852,7 +884,10 @@ mod tests {
|
||||
assert!(result.is_err());
|
||||
|
||||
// AND: We retry with a new peer for the failed column
|
||||
let new_columns_req_id = columns_id(10 as Id, components_id);
|
||||
let new_columns_req_id = columns_id(
|
||||
10 as Id,
|
||||
DataColumnsByRangeRequester::ComponentsByRange(components_id),
|
||||
);
|
||||
let failed_column_requests = vec![(new_columns_req_id, vec![2])];
|
||||
info.reinsert_failed_column_requests(failed_column_requests)
|
||||
.unwrap();
|
||||
@@ -898,7 +933,15 @@ mod tests {
|
||||
let columns_req_id = expected_custody_columns
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, column)| (columns_id(i as Id, components_id), vec![*column]))
|
||||
.map(|(i, column)| {
|
||||
(
|
||||
columns_id(
|
||||
i as Id,
|
||||
DataColumnsByRangeRequester::ComponentsByRange(components_id),
|
||||
),
|
||||
vec![*column],
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let mut info = RangeBlockComponentsRequest::<E>::new(
|
||||
blocks_req_id,
|
||||
|
||||
1126
beacon_node/network/src/sync/custody_backfill_sync/mod.rs
Normal file
1126
beacon_node/network/src/sync/custody_backfill_sync/mod.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -46,7 +46,8 @@ use crate::status::ToStatusMessage;
|
||||
use crate::sync::block_lookups::{
|
||||
BlobRequestState, BlockComponent, BlockRequestState, CustodyRequestState, DownloadResult,
|
||||
};
|
||||
use crate::sync::network_context::PeerGroup;
|
||||
use crate::sync::custody_backfill_sync::CustodyBackFillSync;
|
||||
use crate::sync::network_context::{PeerGroup, RpcResponseResult};
|
||||
use beacon_chain::block_verification_types::AsBlock;
|
||||
use beacon_chain::validator_monitor::timestamp_now;
|
||||
use beacon_chain::{
|
||||
@@ -56,14 +57,16 @@ use futures::StreamExt;
|
||||
use lighthouse_network::SyncInfo;
|
||||
use lighthouse_network::rpc::RPCError;
|
||||
use lighthouse_network::service::api_types::{
|
||||
BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, CustodyRequester,
|
||||
DataColumnsByRangeRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id,
|
||||
SingleLookupReqId, SyncRequestId,
|
||||
BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId,
|
||||
CustodyBackFillBatchRequestId, CustodyBackfillBatchId, CustodyRequester,
|
||||
DataColumnsByRangeRequestId, DataColumnsByRangeRequester, DataColumnsByRootRequestId,
|
||||
DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId,
|
||||
};
|
||||
use lighthouse_network::types::{NetworkGlobals, SyncState};
|
||||
use lighthouse_network::{PeerAction, PeerId};
|
||||
use logging::crit;
|
||||
use lru_cache::LRUTimeCache;
|
||||
use slot_clock::SlotClock;
|
||||
use std::ops::Sub;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
@@ -158,6 +161,12 @@ pub enum SyncMessage<E: EthSpec> {
|
||||
result: BatchProcessResult,
|
||||
},
|
||||
|
||||
/// A custody batch has been processed by the processor thread.
|
||||
CustodyBatchProcessed {
|
||||
batch_id: CustodyBackfillBatchId,
|
||||
result: CustodyBatchProcessResult,
|
||||
},
|
||||
|
||||
/// Block processed
|
||||
BlockComponentProcessed {
|
||||
process_type: BlockProcessType,
|
||||
@@ -209,6 +218,19 @@ pub enum BatchProcessResult {
|
||||
NonFaultyFailure,
|
||||
}
|
||||
|
||||
/// The result of processing multiple data columns.
|
||||
#[derive(Debug)]
|
||||
pub enum CustodyBatchProcessResult {
|
||||
/// The custody batch was completed successfully. It carries whether the sent batch contained data columns.
|
||||
Success {
|
||||
#[allow(dead_code)]
|
||||
sent_columns: usize,
|
||||
imported_columns: usize,
|
||||
},
|
||||
/// The custody batch processing failed.
|
||||
Error { peer_action: Option<PeerAction> },
|
||||
}
|
||||
|
||||
/// The primary object for handling and driving all the current syncing logic. It maintains the
|
||||
/// current state of the syncing process, the number of useful peers, downloaded blocks and
|
||||
/// controls the logic behind both the long-range (batch) sync and the on-going potential parent
|
||||
@@ -229,6 +251,9 @@ pub struct SyncManager<T: BeaconChainTypes> {
|
||||
/// Backfill syncing.
|
||||
backfill_sync: BackFillSync<T>,
|
||||
|
||||
/// Custody syncing.
|
||||
custody_backfill_sync: CustodyBackFillSync<T>,
|
||||
|
||||
block_lookups: BlockLookups<T>,
|
||||
/// debounce duplicated `UnknownBlockHashFromAttestation` for the same root peer tuple. A peer
|
||||
/// may forward us thousands of a attestations, each one triggering an individual event. Only
|
||||
@@ -288,7 +313,8 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
fork_context.clone(),
|
||||
),
|
||||
range_sync: RangeSync::new(beacon_chain.clone()),
|
||||
backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals),
|
||||
backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals.clone()),
|
||||
custody_backfill_sync: CustodyBackFillSync::new(beacon_chain.clone(), network_globals),
|
||||
block_lookups: BlockLookups::new(),
|
||||
notified_unknown_roots: LRUTimeCache::new(Duration::from_secs(
|
||||
NOTIFIED_UNKNOWN_ROOT_EXPIRY_SECONDS,
|
||||
@@ -549,6 +575,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
// inform the backfill sync that a new synced peer has joined us.
|
||||
if new_state.is_synced() {
|
||||
self.backfill_sync.fully_synced_peer_joined();
|
||||
self.custody_backfill_sync.fully_synced_peer_joined();
|
||||
}
|
||||
}
|
||||
is_connected
|
||||
@@ -558,17 +585,18 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Updates the global sync state, optionally instigating or pausing a backfill sync as well as
|
||||
/// Updates the global sync state, optionally instigating or pausing a backfill or custody sync as well as
|
||||
/// logging any changes.
|
||||
///
|
||||
/// The logic for which sync should be running is as follows:
|
||||
/// - If there is a range-sync running (or required) pause any backfill and let range-sync
|
||||
/// - If there is a range-sync running (or required) pause any backfill/custody sync and let range-sync
|
||||
/// complete.
|
||||
/// - If there is no current range sync, check for any requirement to backfill and either
|
||||
/// start/resume a backfill sync if required. The global state will be BackFillSync if a
|
||||
/// backfill sync is running.
|
||||
/// - If there is no range sync and no required backfill and we have synced up to the currently
|
||||
/// known peers, we consider ourselves synced.
|
||||
/// - If there is no range sync and no required backfill we check if we need to execute a custody sync.
|
||||
fn update_sync_state(&mut self) {
|
||||
let new_state: SyncState = match self.range_sync.state() {
|
||||
Err(e) => {
|
||||
@@ -624,15 +652,51 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
error!(error = ?e, "Backfill sync failed to start");
|
||||
}
|
||||
}
|
||||
|
||||
// If backfill is complete, check if we have a pending custody backfill to complete
|
||||
let anchor_info = self.chain.store.get_anchor_info();
|
||||
if anchor_info.block_backfill_complete(self.chain.genesis_backfill_slot) {
|
||||
match self.custody_backfill_sync.start(&mut self.network) {
|
||||
Ok(SyncStart::Syncing {
|
||||
completed,
|
||||
remaining,
|
||||
}) => {
|
||||
sync_state = SyncState::CustodyBackFillSyncing {
|
||||
completed,
|
||||
remaining,
|
||||
};
|
||||
}
|
||||
Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if custody sync state didn't start.
|
||||
Err(e) => {
|
||||
use crate::sync::custody_backfill_sync::CustodyBackfillError;
|
||||
|
||||
match &e {
|
||||
CustodyBackfillError::BatchDownloadFailed(_)
|
||||
| CustodyBackfillError::BatchProcessingFailed(_) => {
|
||||
debug!(error=?e, "Custody backfill batch processing or downloading failed");
|
||||
}
|
||||
CustodyBackfillError::BatchInvalidState(_, reason) => {
|
||||
error!(error=?e, reason, "Custody backfill sync failed due to invalid batch state")
|
||||
}
|
||||
CustodyBackfillError::InvalidSyncState(reason) => {
|
||||
error!(error=?e, reason, "Custody backfill sync failed due to invalid sync state")
|
||||
}
|
||||
CustodyBackfillError::Paused => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the sync state if backfilling is not required.
|
||||
sync_state
|
||||
}
|
||||
Some((RangeSyncType::Finalized, start_slot, target_slot)) => {
|
||||
// If there is a backfill sync in progress pause it.
|
||||
// Range sync is in progress. If there is a backfill or custody sync in progress pause it.
|
||||
#[cfg(not(feature = "disable-backfill"))]
|
||||
self.backfill_sync.pause();
|
||||
self.custody_backfill_sync
|
||||
.pause("Range sync in progress".to_string());
|
||||
|
||||
SyncState::SyncingFinalized {
|
||||
start_slot,
|
||||
@@ -640,9 +704,12 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
}
|
||||
}
|
||||
Some((RangeSyncType::Head, start_slot, target_slot)) => {
|
||||
// If there is a backfill sync in progress pause it.
|
||||
// Range sync is in progress. If there is a backfill or custody backfill sync
|
||||
// in progress pause it.
|
||||
#[cfg(not(feature = "disable-backfill"))]
|
||||
self.backfill_sync.pause();
|
||||
self.custody_backfill_sync
|
||||
.pause("Range sync in progress".to_string());
|
||||
|
||||
SyncState::SyncingHead {
|
||||
start_slot,
|
||||
@@ -662,7 +729,9 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
if new_state.is_synced()
|
||||
&& !matches!(
|
||||
old_state,
|
||||
SyncState::Synced | SyncState::BackFillSyncing { .. }
|
||||
SyncState::Synced
|
||||
| SyncState::BackFillSyncing { .. }
|
||||
| SyncState::CustodyBackFillSyncing { .. }
|
||||
)
|
||||
{
|
||||
self.network.subscribe_core_topics();
|
||||
@@ -693,6 +762,11 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
|
||||
let mut register_metrics_interval = tokio::time::interval(Duration::from_secs(5));
|
||||
|
||||
// Trigger a sync state update every epoch. This helps check if we need to trigger a custody backfill sync.
|
||||
let epoch_duration =
|
||||
self.chain.slot_clock.slot_duration().as_secs() * T::EthSpec::slots_per_epoch();
|
||||
let mut epoch_interval = tokio::time::interval(Duration::from_secs(epoch_duration));
|
||||
|
||||
// process any inbound messages
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -711,6 +785,9 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
_ = register_metrics_interval.tick() => {
|
||||
self.network.register_metrics();
|
||||
}
|
||||
_ = epoch_interval.tick() => {
|
||||
self.update_sync_state();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -865,6 +942,21 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
}
|
||||
}
|
||||
},
|
||||
SyncMessage::CustodyBatchProcessed { result, batch_id } => {
|
||||
match self.custody_backfill_sync.on_batch_process_result(
|
||||
&mut self.network,
|
||||
batch_id,
|
||||
&result,
|
||||
) {
|
||||
Ok(ProcessResult::Successful) => {}
|
||||
Ok(ProcessResult::SyncCompleted) => self.update_sync_state(),
|
||||
Err(error) => {
|
||||
error!(error = ?error, "Custody sync failed");
|
||||
// Update the global status
|
||||
self.update_sync_state();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1081,11 +1173,13 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
RpcEvent::from_chunk(data_column, seen_timestamp),
|
||||
);
|
||||
}
|
||||
SyncRequestId::DataColumnsByRange(id) => self.on_data_columns_by_range_response(
|
||||
id,
|
||||
peer_id,
|
||||
RpcEvent::from_chunk(data_column, seen_timestamp),
|
||||
),
|
||||
SyncRequestId::DataColumnsByRange(req_id) => {
|
||||
self.on_data_columns_by_range_response(
|
||||
req_id,
|
||||
peer_id,
|
||||
RpcEvent::from_chunk(data_column, seen_timestamp),
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
crit!(%peer_id, "bad request id for data_column");
|
||||
}
|
||||
@@ -1173,11 +1267,22 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
.network
|
||||
.on_data_columns_by_range_response(id, peer_id, data_column)
|
||||
{
|
||||
self.on_range_components_response(
|
||||
id.parent_request_id,
|
||||
peer_id,
|
||||
RangeBlockComponent::CustodyColumns(id, resp),
|
||||
);
|
||||
match id.parent_request_id {
|
||||
DataColumnsByRangeRequester::ComponentsByRange(components_by_range_req_id) => {
|
||||
self.on_range_components_response(
|
||||
components_by_range_req_id,
|
||||
peer_id,
|
||||
RangeBlockComponent::CustodyColumns(id, resp),
|
||||
);
|
||||
}
|
||||
DataColumnsByRangeRequester::CustodyBackfillSync(custody_backfill_req_id) => self
|
||||
.on_custody_backfill_columns_response(
|
||||
custody_backfill_req_id,
|
||||
id,
|
||||
peer_id,
|
||||
resp,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1267,6 +1372,36 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Handles receiving a response for a custody range sync request that has columns.
|
||||
fn on_custody_backfill_columns_response(
|
||||
&mut self,
|
||||
custody_sync_request_id: CustodyBackFillBatchRequestId,
|
||||
req_id: DataColumnsByRangeRequestId,
|
||||
peer_id: PeerId,
|
||||
data_columns: RpcResponseResult<Vec<Arc<DataColumnSidecar<T::EthSpec>>>>,
|
||||
) {
|
||||
if let Some(resp) = self.network.custody_backfill_data_columns_response(
|
||||
custody_sync_request_id,
|
||||
req_id,
|
||||
data_columns,
|
||||
) {
|
||||
match self.custody_backfill_sync.on_data_column_response(
|
||||
&mut self.network,
|
||||
custody_sync_request_id,
|
||||
&peer_id,
|
||||
resp,
|
||||
) {
|
||||
Ok(ProcessResult::SyncCompleted) => self.update_sync_state(),
|
||||
Ok(ProcessResult::Successful) => {}
|
||||
Err(_e) => {
|
||||
// The custody sync has failed, errors are reported
|
||||
// within.
|
||||
self.update_sync_state();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Result<AvailabilityProcessingStatus, BlockError>> for BlockProcessingResult {
|
||||
|
||||
@@ -2,14 +2,17 @@
|
||||
//!
|
||||
//! Stores the various syncing methods for the beacon chain.
|
||||
mod backfill_sync;
|
||||
mod batch;
|
||||
mod block_lookups;
|
||||
mod block_sidecar_coupling;
|
||||
mod custody_backfill_sync;
|
||||
pub mod manager;
|
||||
mod network_context;
|
||||
mod peer_sync_info;
|
||||
mod range_data_column_batch_request;
|
||||
mod range_sync;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
pub use manager::{BatchProcessResult, SyncMessage};
|
||||
pub use range_sync::{BatchOperationOutcome, ChainId};
|
||||
pub use range_sync::ChainId;
|
||||
|
||||
@@ -6,16 +6,17 @@ pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlock
|
||||
use super::SyncMessage;
|
||||
use super::block_sidecar_coupling::RangeBlockComponentsRequest;
|
||||
use super::manager::BlockProcessType;
|
||||
use super::range_sync::ByRangeRequestType;
|
||||
use crate::metrics;
|
||||
use crate::network_beacon_processor::NetworkBeaconProcessor;
|
||||
#[cfg(test)]
|
||||
use crate::network_beacon_processor::TestBeaconChainType;
|
||||
use crate::service::NetworkMessage;
|
||||
use crate::status::ToStatusMessage;
|
||||
use crate::sync::batch::ByRangeRequestType;
|
||||
use crate::sync::block_lookups::SingleLookupId;
|
||||
use crate::sync::block_sidecar_coupling::CouplingError;
|
||||
use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest;
|
||||
use crate::sync::range_data_column_batch_request::RangeDataColumnBatchRequest;
|
||||
use beacon_chain::block_verification_types::RpcBlock;
|
||||
use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState};
|
||||
use custody::CustodyRequestResult;
|
||||
@@ -25,7 +26,8 @@ use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, Req
|
||||
pub use lighthouse_network::service::api_types::RangeRequestId;
|
||||
use lighthouse_network::service::api_types::{
|
||||
AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId,
|
||||
CustodyId, CustodyRequester, DataColumnsByRangeRequestId, DataColumnsByRootRequestId,
|
||||
CustodyBackFillBatchRequestId, CustodyBackfillBatchId, CustodyId, CustodyRequester,
|
||||
DataColumnsByRangeRequestId, DataColumnsByRangeRequester, DataColumnsByRootRequestId,
|
||||
DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId,
|
||||
};
|
||||
use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource};
|
||||
@@ -211,7 +213,6 @@ pub struct SyncNetworkContext<T: BeaconChainTypes> {
|
||||
/// A mapping of active DataColumnsByRange requests
|
||||
data_columns_by_range_requests:
|
||||
ActiveRequests<DataColumnsByRangeRequestId, DataColumnsByRangeRequestItems<T::EthSpec>>,
|
||||
|
||||
/// Mapping of active custody column requests for a block root
|
||||
custody_by_root_requests: FnvHashMap<CustodyRequester, ActiveCustodyRequest<T>>,
|
||||
|
||||
@@ -219,6 +220,10 @@ pub struct SyncNetworkContext<T: BeaconChainTypes> {
|
||||
components_by_range_requests:
|
||||
FnvHashMap<ComponentsByRangeRequestId, RangeBlockComponentsRequest<T::EthSpec>>,
|
||||
|
||||
/// A batch of data columns by range request for custody sync
|
||||
custody_backfill_data_column_batch_requests:
|
||||
FnvHashMap<CustodyBackFillBatchRequestId, RangeDataColumnBatchRequest<T>>,
|
||||
|
||||
/// Whether the ee is online. If it's not, we don't allow access to the
|
||||
/// `beacon_processor_send`.
|
||||
execution_engine_state: EngineState,
|
||||
@@ -295,6 +300,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"),
|
||||
custody_by_root_requests: <_>::default(),
|
||||
components_by_range_requests: FnvHashMap::default(),
|
||||
custody_backfill_data_column_batch_requests: FnvHashMap::default(),
|
||||
network_beacon_processor,
|
||||
chain,
|
||||
fork_context,
|
||||
@@ -324,6 +330,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
custody_by_root_requests: _,
|
||||
// components_by_range_requests is a meta request of various _by_range requests
|
||||
components_by_range_requests: _,
|
||||
custody_backfill_data_column_batch_requests: _,
|
||||
execution_engine_state: _,
|
||||
network_beacon_processor: _,
|
||||
chain: _,
|
||||
@@ -354,7 +361,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
.active_requests_of_peer(peer_id)
|
||||
.into_iter()
|
||||
.map(|req_id| SyncRequestId::DataColumnsByRange(*req_id));
|
||||
|
||||
blocks_by_root_ids
|
||||
.chain(blobs_by_root_ids)
|
||||
.chain(data_column_by_root_ids)
|
||||
@@ -421,6 +427,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
custody_by_root_requests: _,
|
||||
// components_by_range_requests is a meta request of various _by_range requests
|
||||
components_by_range_requests: _,
|
||||
custody_backfill_data_column_batch_requests: _,
|
||||
execution_engine_state: _,
|
||||
network_beacon_processor: _,
|
||||
chain: _,
|
||||
@@ -503,7 +510,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
count: *request.count(),
|
||||
columns,
|
||||
},
|
||||
id,
|
||||
DataColumnsByRangeRequester::ComponentsByRange(id),
|
||||
new_range_request_span!(
|
||||
self,
|
||||
"outgoing_columns_by_range_retry",
|
||||
@@ -638,7 +645,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
count: *request.count(),
|
||||
columns,
|
||||
},
|
||||
id,
|
||||
DataColumnsByRangeRequester::ComponentsByRange(id),
|
||||
new_range_request_span!(
|
||||
self,
|
||||
"outgoing_columns_by_range",
|
||||
@@ -1238,7 +1245,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
&mut self,
|
||||
peer_id: PeerId,
|
||||
request: DataColumnsByRangeRequest,
|
||||
parent_request_id: ComponentsByRangeRequestId,
|
||||
parent_request_id: DataColumnsByRangeRequester,
|
||||
request_span: Span,
|
||||
) -> Result<(DataColumnsByRangeRequestId, Vec<u64>), RpcRequestSendError> {
|
||||
let requested_columns = request.columns.clone();
|
||||
@@ -1679,6 +1686,111 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
})
|
||||
}
|
||||
|
||||
    /// Sends the `DataColumnsByRange` requests needed to fetch one custody backfill batch
    /// (one epoch's worth of custody columns), splitting the column indices across the
    /// selected custody peers.
    ///
    /// Peer selection happens *before* any request is sent or an ID is allocated, so a
    /// selection failure leaves no dangling state. On success the per-peer requests are
    /// registered under a single `CustodyBackFillBatchRequestId`, which is returned so the
    /// custody backfill sync can correlate the eventual responses.
    pub fn custody_backfill_data_columns_batch_request(
        &mut self,
        request: DataColumnsByRangeRequest,
        batch_id: CustodyBackfillBatchId,
        peers: &HashSet<PeerId>,
        peers_to_deprioritize: &HashSet<PeerId>,
    ) -> Result<CustodyBackFillBatchRequestId, RpcRequestSendError> {
        let active_request_count_by_peer = self.active_request_count_by_peer();
        // Attempt to find all required custody peers before sending any request or creating an ID
        let columns_by_range_peers_to_request = {
            // Columns this node must custody for the batch's epoch.
            let column_indexes = self
                .chain
                .sampling_columns_for_epoch(batch_id.epoch)
                .iter()
                .cloned()
                .collect();

            self.select_columns_by_range_peers_to_request(
                &column_indexes,
                peers,
                active_request_count_by_peer,
                peers_to_deprioritize,
            )?
        };

        // Create the overall `custody_by_range` request id
        let id = CustodyBackFillBatchRequestId {
            id: self.next_id(),
            batch_id,
        };

        // NOTE(review): `.ok()` silently drops any per-peer request that fails to send, so
        // the batch may be created with fewer column requests than were selected — confirm
        // that the coupling logic downstream tolerates (or is meant to detect) this.
        let result = columns_by_range_peers_to_request
            .iter()
            .filter_map(|(peer_id, _)| {
                self.send_data_columns_by_range_request(
                    *peer_id,
                    request.clone(),
                    DataColumnsByRangeRequester::CustodyBackfillSync(id),
                    Span::none(),
                )
                .ok()
            })
            .collect::<Vec<_>>();

        // Track the successfully-sent requests as one logical batch for this epoch.
        let range_data_column_batch_request =
            RangeDataColumnBatchRequest::new(result, self.chain.clone(), batch_id.epoch);

        self.custody_backfill_data_column_batch_requests
            .insert(id, range_data_column_batch_request);

        Ok(id)
    }
|
||||
|
||||
    /// Received a data columns by range response from a custody sync request which batches them.
    ///
    /// Feeds the columns into the matching batch request and, once the batch has a coupled
    /// result, returns it. Returns `None` while the batch is still waiting on other
    /// per-peer responses (or if the batch id is unknown).
    ///
    /// Entry lifecycle: the batch entry is removed on an add error or on a successful
    /// coupling; it is deliberately *kept* on a coupling error so the remaining/retried
    /// requests can complete against the same batch.
    pub fn custody_backfill_data_columns_response(
        &mut self,
        // Identifies the custody backfill request for all data columns on this epoch
        custody_sync_request_id: CustodyBackFillBatchRequestId,
        // Identifies a specific data_columns_by_range request for *some* columns in this epoch. We
        // pass them separately as DataColumnsByRangeRequestId parent is an enum and would require
        // matching again.
        req_id: DataColumnsByRangeRequestId,
        data_columns: RpcResponseResult<DataColumnSidecarList<T::EthSpec>>,
    ) -> Option<Result<DataColumnSidecarList<T::EthSpec>, RpcResponseError>> {
        // Unknown batch id: count it as an unknown network request and ignore.
        let Entry::Occupied(mut entry) = self
            .custody_backfill_data_column_batch_requests
            .entry(custody_sync_request_id)
        else {
            metrics::inc_counter_vec(
                &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS,
                &["range_data_columns"],
            );
            return None;
        };

        // Register this per-peer response with the batch; an RPC error or an add failure
        // tears the whole batch down.
        if let Err(e) = {
            let request = entry.get_mut();
            data_columns.and_then(|(data_columns, _)| {
                request
                    .add_custody_columns(req_id, data_columns.clone())
                    .map_err(|e| {
                        RpcResponseError::BlockComponentCouplingError(CouplingError::InternalError(
                            e,
                        ))
                    })
            })
        } {
            entry.remove();
            return Some(Err(e));
        }

        if let Some(data_column_result) = entry.get_mut().responses() {
            if data_column_result.is_ok() {
                // remove the entry only if it coupled successfully with
                // no errors
                entry.remove();
            }
            // If the request is finished, dequeue everything
            Some(data_column_result.map_err(RpcResponseError::BlockComponentCouplingError))
        } else {
            None
        }
    }
|
||||
|
||||
pub(crate) fn register_metrics(&self) {
|
||||
for (id, count) in [
|
||||
("blocks_by_root", self.blocks_by_root_requests.len()),
|
||||
|
||||
297
beacon_node/network/src/sync/range_data_column_batch_request.rs
Normal file
297
beacon_node/network/src/sync/range_data_column_batch_request.rs
Normal file
@@ -0,0 +1,297 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use crate::sync::block_sidecar_coupling::{ByRangeRequest, CouplingError};
|
||||
use crate::sync::network_context::MAX_COLUMN_RETRIES;
|
||||
use beacon_chain::{BeaconChain, BeaconChainTypes};
|
||||
use itertools::Itertools;
|
||||
use lighthouse_network::PeerId;
|
||||
use lighthouse_network::service::api_types::DataColumnsByRangeRequestId;
|
||||
use std::sync::Arc;
|
||||
use types::{ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, Slot};
|
||||
|
||||
/// Tracks one epoch's worth of custody-backfill `DataColumnsByRange` requests and couples
/// their responses into a single validated list of data column sidecars.
pub struct RangeDataColumnBatchRequest<T: BeaconChainTypes> {
    /// The in-flight (or finished) by-range requests that make up this batch, keyed by
    /// their request id.
    requests: HashMap<
        DataColumnsByRangeRequestId,
        ByRangeRequest<DataColumnsByRangeRequestId, DataColumnSidecarList<T::EthSpec>>,
    >,
    /// The column indices corresponding to the request
    column_peers: HashMap<DataColumnsByRangeRequestId, Vec<ColumnIndex>>,
    /// Union of all column indices requested across the batch — the full custody set we
    /// expect to receive for this epoch.
    expected_custody_columns: HashSet<ColumnIndex>,
    /// Number of coupling attempts made so far; compared against `MAX_COLUMN_RETRIES`.
    attempt: usize,
    /// Used to look up canonical block roots/blocks when validating received columns.
    beacon_chain: Arc<BeaconChain<T>>,
    /// The epoch this batch covers.
    epoch: Epoch,
}
|
||||
|
||||
impl<T: BeaconChainTypes> RangeDataColumnBatchRequest<T> {
    /// Builds a batch from the per-peer `DataColumnsByRange` requests that were sent out for
    /// `epoch`, paired with the column indices each request covers.
    pub fn new(
        by_range_requests: Vec<(DataColumnsByRangeRequestId, Vec<ColumnIndex>)>,
        beacon_chain: Arc<BeaconChain<T>>,
        epoch: Epoch,
    ) -> Self {
        // Every request starts out active; it is marked finished as its response arrives.
        let requests = by_range_requests
            .clone()
            .into_iter()
            .map(|(req, _)| (req, ByRangeRequest::Active(req)))
            .collect::<HashMap<_, _>>();

        let column_peers = by_range_requests.clone().into_iter().collect();

        // Union of all requested column indices: the complete set we expect to custody.
        let expected_custody_columns = by_range_requests
            .into_iter()
            .flat_map(|(_, column_indices)| column_indices)
            .collect();

        Self {
            requests,
            column_peers,
            expected_custody_columns,
            beacon_chain,
            epoch,
            attempt: 0,
        }
    }

    /// Records the data columns received for one of the underlying by-range requests,
    /// marking that request as finished.
    ///
    /// Errors (with a descriptive string) if `req_id` does not belong to this batch or the
    /// request is not in a state that can accept a response.
    pub fn add_custody_columns(
        &mut self,
        req_id: DataColumnsByRangeRequestId,
        columns: Vec<Arc<DataColumnSidecar<T::EthSpec>>>,
    ) -> Result<(), String> {
        let req = self
            .requests
            .get_mut(&req_id)
            .ok_or(format!("unknown data columns by range req_id {req_id}"))?;
        req.finish(req_id, columns)
    }

    /// Attempts to couple and validate everything received so far for this batch.
    ///
    /// On a `DataColumnPeerFailure` result, the requests belonging to the faulty peers are
    /// dropped from the batch so a fresh attempt can be made for just those components.
    ///
    /// NOTE(review): the comment on `self.attempt` below claims an attempt is only counted
    /// once *all* requests have finished, but no guard here checks that every request's
    /// `to_finished()` is `Some` before coupling — confirm the caller/`ByRangeRequest`
    /// semantics guarantee this, otherwise partial batches may be coupled prematurely.
    pub fn responses(
        &mut self,
    ) -> Option<Result<DataColumnSidecarList<T::EthSpec>, CouplingError>> {
        // Group all columns from finished requests by slot.
        let mut received_columns_for_slot: HashMap<Slot, DataColumnSidecarList<T::EthSpec>> =
            HashMap::new();
        let mut column_to_peer_id: HashMap<u64, PeerId> = HashMap::new();

        for column in self
            .requests
            .values()
            .filter_map(|req| req.to_finished())
            .flatten()
        {
            received_columns_for_slot
                .entry(column.slot())
                .or_default()
                .push(column.clone());
        }

        // Note: this assumes that only 1 peer is responsible for a column
        // with a batch.
        for (id, columns) in self.column_peers.iter() {
            for column in columns {
                column_to_peer_id.insert(*column, id.peer);
            }
        }

        // An "attempt" is complete here after we have received a response for all the
        // requests we made. i.e. `req.to_finished()` returns Some for all requests.
        self.attempt += 1;

        let resp = self.responses_with_custody_columns(
            received_columns_for_slot,
            column_to_peer_id,
            &self.expected_custody_columns,
            self.attempt,
        );

        if let Err(CouplingError::DataColumnPeerFailure {
            error: _,
            faulty_peers,
            exceeded_retries: _,
        }) = &resp
        {
            for (_, peer) in faulty_peers.iter() {
                // find the req id associated with the peer and
                // delete it from the entries as we are going to make
                // a separate attempt for those components.
                self.requests.retain(|&k, _| k.peer != *peer);
            }
        }
        Some(resp)
    }

    /// Validates the received columns against the node's own canonical chain for this epoch
    /// and returns them coupled into one list, or a `CouplingError` naming the peers that
    /// served bad/missing data.
    ///
    /// Validation per canonical slot: columns must exist when the block expects blobs, must
    /// not be served for skipped slots, and must agree on (and match) the canonical block
    /// root and block signature. KZG verification is assumed to happen elsewhere.
    fn responses_with_custody_columns(
        &self,
        mut received_columns_for_slot: HashMap<Slot, DataColumnSidecarList<T::EthSpec>>,
        column_to_peer: HashMap<ColumnIndex, PeerId>,
        expected_custody_columns: &HashSet<ColumnIndex>,
        attempt: usize,
    ) -> Result<DataColumnSidecarList<T::EthSpec>, CouplingError> {
        // (column index, peer) pairs to penalize; non-empty at the end => coupling failure.
        let mut naughty_peers = vec![];
        let mut result: DataColumnSidecarList<T::EthSpec> = vec![];

        // Walk the node's own canonical block roots over the batch's epoch.
        let forward_blocks_iter = self
            .beacon_chain
            .forwards_iter_block_roots_until(
                self.epoch.start_slot(T::EthSpec::slots_per_epoch()),
                self.epoch.end_slot(T::EthSpec::slots_per_epoch()),
            )
            .map_err(|_| {
                CouplingError::InternalError("Failed to fetch block root iterator".to_string())
            })?;

        for block_iter_result in forward_blocks_iter {
            let (block_root, slot) = block_iter_result.map_err(|_| {
                CouplingError::InternalError("Failed to iterate block roots".to_string())
            })?;

            let Some(block) = self
                .beacon_chain
                .get_blinded_block(&block_root)
                .ok()
                .flatten()
            else {
                // The block root we are fetching is from the forwards block root iterator. This doesn't seem like a possible scenario.
                return Err(CouplingError::InternalError(
                    "Block root from forwards block iterator not found in db".to_string(),
                ));
            };

            let Some(columns) = received_columns_for_slot.remove(&slot) else {
                // If at least one blob is expected for this slot but none have been served, penalize all peers
                // The slot check ensures we arent checking a skipped slot.
                if block.num_expected_blobs() != 0 && block.slot() == slot {
                    for column in expected_custody_columns {
                        if let Some(naughty_peer) = column_to_peer.get(column) {
                            naughty_peers.push((*column, *naughty_peer));
                        }
                    }
                }
                continue;
            };

            // This is a skipped slot, skip to the next slot after we verify that peers
            // didn't serve us columns for a skipped slot
            if block.slot() != slot {
                // If we received columns for a skipped slot, punish the peer
                // NOTE(review): this penalizes the peers for *all* expected columns, not
                // just the peers that actually served columns for the skipped slot —
                // confirm that collective punishment is intended here.
                if !columns.is_empty() {
                    for column in expected_custody_columns {
                        if let Some(naughty_peer) = column_to_peer.get(column) {
                            naughty_peers.push((*column, *naughty_peer));
                        }
                    }
                }

                continue;
            }

            let column_block_roots = columns
                .iter()
                .map(|column| column.block_root())
                .unique()
                .collect::<Vec<_>>();

            let column_block_signatures = columns
                .iter()
                .map(|column| column.signed_block_header.signature.clone())
                .unique()
                .collect::<Vec<_>>();

            let column_block_root = match column_block_roots.as_slice() {
                // We expect a single unique block root
                [column_block_root] => *column_block_root,
                // If there are no block roots, penalize all peers
                // NOTE(review): `columns` is non-empty on this path (the skipped-slot branch
                // above consumed the empty-via-missing case), so this arm looks unreachable.
                [] => {
                    for column in &columns {
                        if let Some(naughty_peer) = column_to_peer.get(&column.index) {
                            naughty_peers.push((column.index, *naughty_peer));
                        }
                    }
                    continue;
                }
                // If theres more than one unique block root penalize the peers serving the bad block roots.
                column_block_roots => {
                    for column in columns {
                        // NOTE(review): the `contains` check is tautological — every
                        // column's root is in `column_block_roots` by construction.
                        if column_block_roots.contains(&column.block_root())
                            && block_root != column.block_root()
                            && let Some(naughty_peer) = column_to_peer.get(&column.index)
                        {
                            naughty_peers.push((column.index, *naughty_peer));
                        }
                    }
                    continue;
                }
            };

            let column_block_signature = match column_block_signatures.as_slice() {
                // We expect a single unique block signature
                [block_signature] => block_signature,
                // If there are no block signatures, penalize all peers
                [] => {
                    for column in &columns {
                        if let Some(naughty_peer) = column_to_peer.get(&column.index) {
                            naughty_peers.push((column.index, *naughty_peer));
                        }
                    }
                    continue;
                }
                // If theres more than one unique block signature, penalize the peers serving the
                // invalid block signatures.
                column_block_signatures => {
                    for column in columns {
                        if column_block_signatures.contains(&column.signed_block_header.signature)
                            && block.signature() != &column.signed_block_header.signature
                            && let Some(naughty_peer) = column_to_peer.get(&column.index)
                        {
                            naughty_peers.push((column.index, *naughty_peer));
                        }
                    }
                    continue;
                }
            };

            // if the block root doesn't match the columns block root, penalize the peers
            if block_root != column_block_root {
                for column in &columns {
                    if let Some(naughty_peer) = column_to_peer.get(&column.index) {
                        naughty_peers.push((column.index, *naughty_peer));
                    }
                }
            }

            // If the block signature doesn't match the columns block signature, penalize the peers
            if block.signature() != column_block_signature {
                for column in &columns {
                    if let Some(naughty_peer) = column_to_peer.get(&column.index) {
                        naughty_peers.push((column.index, *naughty_peer));
                    }
                }
            }

            let received_columns = columns.iter().map(|c| c.index).collect::<HashSet<_>>();

            // NOTE(review): `received.difference(expected)` yields columns that were received
            // but NOT expected (extras). The variable name and the comment below suggest the
            // intent is `expected - received` (truly missing columns), in which case the
            // penalty loop below almost never fires as written — confirm the intended
            // direction of this set difference.
            let missing_columns = received_columns
                .difference(expected_custody_columns)
                .collect::<HashSet<_>>();

            // blobs are expected for this slot but there is at least one missing columns
            // penalize the peers responsible for those columns.
            if block.num_expected_blobs() != 0 && !missing_columns.is_empty() {
                for column in missing_columns {
                    if let Some(naughty_peer) = column_to_peer.get(column) {
                        naughty_peers.push((*column, *naughty_peer));
                    };
                }
            }

            result.extend(columns);
        }

        if !naughty_peers.is_empty() {
            return Err(CouplingError::DataColumnPeerFailure {
                error: "Bad or missing columns for some slots".to_string(),
                faulty_peers: naughty_peers,
                exceeded_retries: attempt >= MAX_COLUMN_RETRIES,
            });
        }

        Ok(result)
    }
}
|
||||
@@ -1,10 +1,13 @@
|
||||
use super::RangeSyncType;
|
||||
use super::batch::{BatchInfo, BatchProcessingResult, BatchState};
|
||||
use crate::metrics;
|
||||
use crate::network_beacon_processor::ChainSegmentProcessId;
|
||||
use crate::sync::batch::BatchId;
|
||||
use crate::sync::batch::{
|
||||
BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState,
|
||||
};
|
||||
use crate::sync::block_sidecar_coupling::CouplingError;
|
||||
use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcResponseError};
|
||||
use crate::sync::{BatchOperationOutcome, BatchProcessResult, network_context::SyncNetworkContext};
|
||||
use crate::sync::{BatchProcessResult, network_context::SyncNetworkContext};
|
||||
use beacon_chain::BeaconChainTypes;
|
||||
use beacon_chain::block_verification_types::RpcBlock;
|
||||
use lighthouse_network::service::api_types::Id;
|
||||
@@ -12,6 +15,8 @@ use lighthouse_network::{PeerAction, PeerId};
|
||||
use lighthouse_tracing::SPAN_SYNCING_CHAIN;
|
||||
use logging::crit;
|
||||
use std::collections::{BTreeMap, HashSet, btree_map::Entry};
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::marker::PhantomData;
|
||||
use strum::IntoStaticStr;
|
||||
use tracing::{Span, debug, instrument, warn};
|
||||
use types::{ColumnIndex, Epoch, EthSpec, Hash256, Slot};
|
||||
@@ -35,6 +40,35 @@ const BATCH_BUFFER_SIZE: u8 = 5;
|
||||
/// and continued is now in an inconsistent state.
|
||||
pub type ProcessingResult = Result<KeepChain, RemoveChain>;
|
||||
|
||||
type RpcBlocks<E> = Vec<RpcBlock<E>>;
|
||||
type RangeSyncBatchInfo<E> = BatchInfo<E, RangeSyncBatchConfig<E>, RpcBlocks<E>>;
|
||||
type RangeSyncBatches<E> = BTreeMap<BatchId, RangeSyncBatchInfo<E>>;
|
||||
|
||||
/// The number of times to retry a batch before it is considered failed.
|
||||
const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5;
|
||||
|
||||
/// Invalid batches are attempted to be re-downloaded from other peers. If a batch cannot be processed
|
||||
/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty.
|
||||
const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3;
|
||||
|
||||
/// Zero-sized marker type carrying the range-sync retry limits via its `BatchConfig` impl.
pub struct RangeSyncBatchConfig<E: EthSpec> {
    // PhantomData ties the config to an `EthSpec` without storing any data.
    marker: PhantomData<E>,
}
|
||||
|
||||
/// Range-sync batch policy: download/processing retry limits and the hash used to compare
/// batch attempts.
impl<E: EthSpec> BatchConfig for RangeSyncBatchConfig<E> {
    fn max_batch_download_attempts() -> u8 {
        MAX_BATCH_DOWNLOAD_ATTEMPTS
    }
    fn max_batch_processing_attempts() -> u8 {
        MAX_BATCH_PROCESSING_ATTEMPTS
    }
    // Fingerprints a batch's data so differing re-download attempts can be distinguished.
    // NOTE(review): `DefaultHasher` is only stable within a single process run (its keys are
    // unspecified across Rust releases) — fine for in-memory attempt comparison, but do not
    // persist these hashes.
    fn batch_attempt_hash<D: Hash>(data: &D) -> u64 {
        let mut hasher = std::collections::hash_map::DefaultHasher::new();
        data.hash(&mut hasher);
        hasher.finish()
    }
}
|
||||
|
||||
/// Reasons for removing a chain
|
||||
#[derive(Debug)]
|
||||
#[allow(dead_code)]
|
||||
@@ -55,7 +89,6 @@ pub struct KeepChain;
|
||||
|
||||
/// A chain identifier
|
||||
pub type ChainId = Id;
|
||||
pub type BatchId = Epoch;
|
||||
|
||||
#[derive(Debug, Copy, Clone, IntoStaticStr)]
|
||||
pub enum SyncingChainType {
|
||||
@@ -85,7 +118,7 @@ pub struct SyncingChain<T: BeaconChainTypes> {
|
||||
pub target_head_root: Hash256,
|
||||
|
||||
/// Sorted map of batches undergoing some kind of processing.
|
||||
batches: BTreeMap<BatchId, BatchInfo<T::EthSpec>>,
|
||||
batches: RangeSyncBatches<T::EthSpec>,
|
||||
|
||||
/// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain
|
||||
/// and thus available to download this chain from, as well as the batches we are currently
|
||||
@@ -249,7 +282,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// request_id matches
|
||||
// TODO(das): removed peer_id matching as the node may request a different peer for data
|
||||
// columns.
|
||||
if !batch.is_expecting_block(&request_id) {
|
||||
if !batch.is_expecting_request_id(&request_id) {
|
||||
return Ok(KeepChain);
|
||||
}
|
||||
batch
|
||||
@@ -260,7 +293,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// Remove the request from the peer's active batches
|
||||
|
||||
// TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258
|
||||
let received = batch.download_completed(blocks, *peer_id)?;
|
||||
let received = blocks.len();
|
||||
batch.download_completed(blocks, *peer_id)?;
|
||||
let awaiting_batches = batch_id
|
||||
.saturating_sub(self.optimistic_start.unwrap_or(self.processing_target))
|
||||
/ EPOCHS_PER_BATCH;
|
||||
@@ -918,7 +952,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// A batch could be retried without the peer failing the request (disconnecting/
|
||||
// sending an error /timeout) if the peer is removed from the chain for other
|
||||
// reasons. Check that this block belongs to the expected peer
|
||||
if !batch.is_expecting_block(&request_id) {
|
||||
if !batch.is_expecting_request_id(&request_id) {
|
||||
debug!(
|
||||
batch_epoch = %batch_id,
|
||||
batch_state = ?batch.state(),
|
||||
@@ -1233,7 +1267,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// only request batches up to the buffer size limit
|
||||
// NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync
|
||||
// if the current processing window is contained in a long range of skip slots.
|
||||
let in_buffer = |batch: &BatchInfo<T::EthSpec>| {
|
||||
let in_buffer = |batch: &RangeSyncBatchInfo<T::EthSpec>| {
|
||||
matches!(
|
||||
batch.state(),
|
||||
BatchState::Downloading(..) | BatchState::AwaitingProcessing(..)
|
||||
@@ -1320,7 +1354,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
}
|
||||
|
||||
use super::batch::WrongState as WrongBatchState;
|
||||
use crate::sync::batch::WrongState as WrongBatchState;
|
||||
impl From<WrongBatchState> for RemoveChain {
|
||||
fn from(err: WrongBatchState) -> Self {
|
||||
RemoveChain::WrongBatchState(err.0)
|
||||
|
||||
@@ -1,17 +1,11 @@
|
||||
//! This provides the logic for syncing a chain when the local node is far behind it's current
|
||||
//! peers.
|
||||
|
||||
mod batch;
|
||||
mod chain;
|
||||
mod chain_collection;
|
||||
mod range;
|
||||
mod sync_type;
|
||||
|
||||
pub use batch::{
|
||||
BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState,
|
||||
ByRangeRequestType,
|
||||
};
|
||||
pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH};
|
||||
pub use chain::{ChainId, EPOCHS_PER_BATCH};
|
||||
#[cfg(test)]
|
||||
pub use chain_collection::SyncChainStatus;
|
||||
pub use range::RangeSync;
|
||||
|
||||
@@ -39,12 +39,13 @@
|
||||
//! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially
|
||||
//! and further batches are requested as current blocks are being processed.
|
||||
|
||||
use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain};
|
||||
use super::chain::{ChainId, RemoveChain, SyncingChain};
|
||||
use super::chain_collection::{ChainCollection, SyncChainStatus};
|
||||
use super::sync_type::RangeSyncType;
|
||||
use crate::metrics;
|
||||
use crate::status::ToStatusMessage;
|
||||
use crate::sync::BatchProcessResult;
|
||||
use crate::sync::batch::BatchId;
|
||||
use crate::sync::network_context::{RpcResponseError, SyncNetworkContext};
|
||||
use beacon_chain::block_verification_types::RpcBlock;
|
||||
use beacon_chain::{BeaconChain, BeaconChainTypes};
|
||||
|
||||
Reference in New Issue
Block a user