Fix PeerDAS sync scoring (#7352)

* Remove request tracking inside syncing chains

* Prioritize by range peers in network context

* Prioritize custody peers for columns by range

* Explicit error handling of the no peers error case

* Remove good_peers_on_sampling_subnets

* Count AwaitingDownload towards the buffer limit

* Retry syncing chains in AwaitingDownload state

* Use same peer priorization for lookups

* Review PR

* Address TODOs

* Revert changes to peer erroring in range sync

* Revert metrics changes

* Update comment

* Pass peers_to_deprioritize to select_columns_by_range_peers_to_request

* more idiomatic

* Idiomatic while

* Add note about infinite loop

* Use while let

* Fix wrong custody column count for lookup blocks

* Remove impl

* Remove stale comment

* Fix build errors.

* Or default

* Review PR

* BatchPeerGroup

* Match block and blob signatures

* Explicit match statement to BlockError in range sync

* Remove todo in BatchPeerGroup

* Remove participating peers from backfill sync

* Remove MissingAllCustodyColumns error

* Merge fixes

* Clean up PR

* Consistent naming of batch_peers

* Address multiple review comments

* Better errors for das

* Penalize column peers once

* Restore fn

* Fix error enum

* Removed MismatchedPublicKeyLen

* Revert testing changes

* Change BlockAndCustodyColumns enum variant

* Revert type change in import_historical_block_batch

* Drop pubkey cache

* Don't collect Vec

* Classify errors

* Remove ReconstructColumnsError

* More detailed UnrequestedSlot error

* Lint test

* Fix slot conversion

* Reduce penalty for missing blobs

* Revert changes in peer selection

* Lint tests

* Rename block matching functions

* Reorder block matching in historical blocks

* Fix order of block matching

* Add store tests

* Filter blockchain in assert_correct_historical_block_chain

* Also filter before KZG checks

* Lint tests

* Fix lint

* Fix fulu err assertion

* Check point is not at infinity

* Fix ws sync test

* Revert dropping filter fn

---------

Co-authored-by: Jimmy Chen <jchen.tc@gmail.com>
Co-authored-by: Jimmy Chen <jimmy@sigmaprime.io>
Co-authored-by: Pawan Dhananjay <pawandhananjay@gmail.com>
This commit is contained in:
Lion - dapplion
2025-05-21 08:06:42 -05:00
committed by GitHub
parent f06d1d0346
commit b014675b7a
27 changed files with 1103 additions and 654 deletions

View File

@@ -2,13 +2,13 @@ use beacon_chain::block_verification_types::RpcBlock;
use lighthouse_network::rpc::methods::BlocksByRangeRequest;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::PeerId;
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::hash::{Hash, Hasher};
use std::ops::Sub;
use std::time::{Duration, Instant};
use strum::Display;
use types::{Epoch, EthSpec, Slot};
use types::{ColumnIndex, Epoch, EthSpec, Slot};
/// The number of times to retry a batch before it is considered failed.
const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5;
@@ -26,6 +26,35 @@ pub enum ByRangeRequestType {
Blocks,
}
#[derive(Clone, Debug)]
pub struct BatchPeers {
block_peer: PeerId,
column_peers: HashMap<ColumnIndex, PeerId>,
}
impl BatchPeers {
pub fn new_from_block_peer(block_peer: PeerId) -> Self {
Self {
block_peer,
column_peers: <_>::default(),
}
}
pub fn new(block_peer: PeerId, column_peers: HashMap<ColumnIndex, PeerId>) -> Self {
Self {
block_peer,
column_peers,
}
}
pub fn block(&self) -> PeerId {
self.block_peer
}
pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> {
self.column_peers.get(index)
}
}
/// Allows customisation of the above constants used in other sync methods such as BackFillSync.
pub trait BatchConfig {
/// The maximum batch download attempts.
@@ -110,8 +139,6 @@ pub struct BatchInfo<E: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
failed_download_attempts: Vec<Option<PeerId>>,
/// State of the batch.
state: BatchState<E>,
/// Whether this batch contains all blocks or all blocks and blobs.
batch_type: ByRangeRequestType,
/// Pin the generic
marker: std::marker::PhantomData<B>,
}
@@ -134,7 +161,7 @@ pub enum BatchState<E: EthSpec> {
/// The batch is being downloaded.
Downloading(Id),
/// The batch has been completely downloaded and is ready for processing.
AwaitingProcessing(PeerId, Vec<RpcBlock<E>>, Instant),
AwaitingProcessing(BatchPeers, Vec<RpcBlock<E>>, Instant),
/// The batch is being processed.
Processing(Attempt),
/// The batch was successfully processed and is waiting to be validated.
@@ -171,7 +198,7 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
/// fork boundary will be of mixed type (all blocks and one last blockblob), and I don't want to
/// deal with this for now.
/// This means finalization might be slower in deneb
pub fn new(start_epoch: &Epoch, num_of_epochs: u64, batch_type: ByRangeRequestType) -> Self {
pub fn new(start_epoch: &Epoch, num_of_epochs: u64) -> Self {
let start_slot = start_epoch.start_slot(E::slots_per_epoch());
let end_slot = start_slot + num_of_epochs * E::slots_per_epoch();
BatchInfo {
@@ -181,20 +208,22 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
failed_download_attempts: Vec::new(),
non_faulty_processing_attempts: 0,
state: BatchState::AwaitingDownload,
batch_type,
marker: std::marker::PhantomData,
}
}
/// Gives a list of peers from which this batch has had a failed download or processing
/// attempt.
pub fn failed_peers(&self) -> HashSet<PeerId> {
///
/// TODO(das): Returns only block peers to keep the mainnet path equivalent. The failed peers
/// mechanism is broken for PeerDAS and will be fixed with https://github.com/sigp/lighthouse/issues/6258
pub fn failed_block_peers(&self) -> HashSet<PeerId> {
let mut peers = HashSet::with_capacity(
self.failed_processing_attempts.len() + self.failed_download_attempts.len(),
);
for attempt in &self.failed_processing_attempts {
peers.insert(attempt.peer_id);
peers.insert(attempt.peers.block());
}
for peer in self.failed_download_attempts.iter().flatten() {
@@ -212,13 +241,13 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
false
}
/// Returns the peer that is currently responsible for progressing the state of the batch.
pub fn processing_peer(&self) -> Option<&PeerId> {
/// Returns the peers that provided this batch's downloaded contents
pub fn processing_peers(&self) -> Option<&BatchPeers> {
match &self.state {
BatchState::AwaitingDownload | BatchState::Failed | BatchState::Downloading(..) => None,
BatchState::AwaitingProcessing(peer_id, _, _)
| BatchState::Processing(Attempt { peer_id, .. })
| BatchState::AwaitingValidation(Attempt { peer_id, .. }) => Some(peer_id),
BatchState::AwaitingProcessing(peers, _, _)
| BatchState::Processing(Attempt { peers, .. })
| BatchState::AwaitingValidation(Attempt { peers, .. }) => Some(peers),
BatchState::Poisoned => unreachable!("Poisoned batch"),
}
}
@@ -237,13 +266,10 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
}
/// Returns a BlocksByRange request associated with the batch.
pub fn to_blocks_by_range_request(&self) -> (BlocksByRangeRequest, ByRangeRequestType) {
(
BlocksByRangeRequest::new(
self.start_slot.into(),
self.end_slot.sub(self.start_slot).into(),
),
self.batch_type,
pub fn to_blocks_by_range_request(&self) -> BlocksByRangeRequest {
BlocksByRangeRequest::new(
self.start_slot.into(),
self.end_slot.sub(self.start_slot).into(),
)
}
@@ -275,12 +301,12 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
pub fn download_completed(
&mut self,
blocks: Vec<RpcBlock<E>>,
peer: PeerId,
batch_peers: BatchPeers,
) -> Result<usize /* Received blocks */, WrongState> {
match self.state.poison() {
BatchState::Downloading(_) => {
BatchState::Downloading(_request_id) => {
let received = blocks.len();
self.state = BatchState::AwaitingProcessing(peer, blocks, Instant::now());
self.state = BatchState::AwaitingProcessing(batch_peers, blocks, Instant::now());
Ok(received)
}
BatchState::Poisoned => unreachable!("Poisoned batch"),
@@ -305,10 +331,9 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
peer: Option<PeerId>,
) -> Result<BatchOperationOutcome, WrongState> {
match self.state.poison() {
BatchState::Downloading(_) => {
BatchState::Downloading(_request_id) => {
// register the attempt and check if the batch can be tried again
self.failed_download_attempts.push(peer);
self.state = if self.failed_download_attempts.len()
>= B::max_batch_download_attempts() as usize
{
@@ -349,8 +374,8 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
pub fn start_processing(&mut self) -> Result<(Vec<RpcBlock<E>>, Duration), WrongState> {
match self.state.poison() {
BatchState::AwaitingProcessing(peer, blocks, start_instant) => {
self.state = BatchState::Processing(Attempt::new::<B, E>(peer, &blocks));
BatchState::AwaitingProcessing(peers, blocks, start_instant) => {
self.state = BatchState::Processing(Attempt::new::<B, E>(peers, &blocks));
Ok((blocks, start_instant.elapsed()))
}
BatchState::Poisoned => unreachable!("Poisoned batch"),
@@ -438,39 +463,41 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
}
}
/// Represents a peer's attempt and providing the result for this batch.
/// Represents a batch attempt awaiting validation
///
/// Invalid attempts will downscore a peer.
#[derive(PartialEq, Debug)]
/// Invalid attempts will downscore its peers
#[derive(Debug)]
pub struct Attempt {
/// The peer that made the attempt.
pub peer_id: PeerId,
/// The peers that served this batch contents
peers: BatchPeers,
/// The hash of the blocks of the attempt.
pub hash: u64,
}
impl Attempt {
fn new<B: BatchConfig, E: EthSpec>(peer_id: PeerId, blocks: &[RpcBlock<E>]) -> Self {
fn new<B: BatchConfig, E: EthSpec>(peers: BatchPeers, blocks: &[RpcBlock<E>]) -> Self {
let hash = B::batch_attempt_hash(blocks);
Attempt { peer_id, hash }
Attempt { peers, hash }
}
pub fn block_peer(&self) -> PeerId {
self.peers.block()
}
}
impl<E: EthSpec> std::fmt::Debug for BatchState<E> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
BatchState::Processing(Attempt {
ref peer_id,
hash: _,
}) => write!(f, "Processing({})", peer_id),
BatchState::AwaitingValidation(Attempt {
ref peer_id,
hash: _,
}) => write!(f, "AwaitingValidation({})", peer_id),
BatchState::Processing(Attempt { ref peers, hash: _ }) => {
write!(f, "Processing({})", peers.block())
}
BatchState::AwaitingValidation(Attempt { ref peers, hash: _ }) => {
write!(f, "AwaitingValidation({})", peers.block())
}
BatchState::AwaitingDownload => f.write_str("AwaitingDownload"),
BatchState::Failed => f.write_str("Failed"),
BatchState::AwaitingProcessing(ref peer, ref blocks, _) => {
write!(f, "AwaitingProcessing({}, {} blocks)", peer, blocks.len())
BatchState::AwaitingProcessing(_, ref blocks, _) => {
write!(f, "AwaitingProcessing({} blocks)", blocks.len())
}
BatchState::Downloading(request_id) => {
write!(f, "Downloading({})", request_id)