Fix PeerDAS sync scoring (#7352)

* Remove request tracking inside syncing chains

* Prioritize by range peers in network context

* Prioritize custody peers for columns by range

* Explicit error handling of the no peers error case

* Remove good_peers_on_sampling_subnets

* Count AwaitingDownload towards the buffer limit

* Retry syncing chains in AwaitingDownload state

* Use same peer prioritization for lookups

* Review PR

* Address TODOs

* Revert changes to peer erroring in range sync

* Revert metrics changes

* Update comment

* Pass peers_to_deprioritize to select_columns_by_range_peers_to_request

* more idiomatic

* Idiomatic while

* Add note about infinite loop

* Use while let

* Fix wrong custody column count for lookup blocks

* Remove impl

* Remove stale comment

* Fix build errors.

* Or default

* Review PR

* BatchPeerGroup

* Match block and blob signatures

* Explicit match statement to BlockError in range sync

* Remove todo in BatchPeerGroup

* Remove participating peers from backfill sync

* Remove MissingAllCustodyColumns error

* Merge fixes

* Clean up PR

* Consistent naming of batch_peers

* Address multiple review comments

* Better errors for das

* Penalize column peers once

* Restore fn

* Fix error enum

* Removed MismatchedPublicKeyLen

* Revert testing changes

* Change BlockAndCustodyColumns enum variant

* Revert type change in import_historical_block_batch

* Drop pubkey cache

* Don't collect Vec

* Classify errors

* Remove ReconstructColumnsError

* More detailed UnrequestedSlot error

* Lint test

* Fix slot conversion

* Reduce penalty for missing blobs

* Revert changes in peer selection

* Lint tests

* Rename block matching functions

* Reorder block matching in historical blocks

* Fix order of block matching

* Add store tests

* Filter blockchain in assert_correct_historical_block_chain

* Also filter before KZG checks

* Lint tests

* Fix lint

* Fix fulu err assertion

* Check point is not at infinity

* Fix ws sync test

* Revert dropping filter fn

---------

Co-authored-by: Jimmy Chen <jchen.tc@gmail.com>
Co-authored-by: Jimmy Chen <jimmy@sigmaprime.io>
Co-authored-by: Pawan Dhananjay <pawandhananjay@gmail.com>
This commit is contained in:
Lion - dapplion
2025-05-21 08:06:42 -05:00
committed by GitHub
parent f06d1d0346
commit b014675b7a
27 changed files with 1103 additions and 654 deletions

View File

@@ -1,4 +1,4 @@
use super::batch::{BatchInfo, BatchProcessingResult, BatchState};
use super::batch::{BatchInfo, BatchPeers, BatchProcessingResult, BatchState};
use super::RangeSyncType;
use crate::metrics;
use crate::network_beacon_processor::ChainSegmentProcessId;
@@ -6,6 +6,7 @@ use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcRespo
use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult};
use beacon_chain::block_verification_types::RpcBlock;
use beacon_chain::BeaconChainTypes;
use itertools::Itertools;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::{PeerAction, PeerId};
use logging::crit;
@@ -216,7 +217,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
&mut self,
network: &mut SyncNetworkContext<T>,
batch_id: BatchId,
peer_id: &PeerId,
batch_peers: BatchPeers,
request_id: Id,
blocks: Vec<RpcBlock<T::EthSpec>>,
) -> ProcessingResult {
@@ -244,8 +245,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// A stream termination has been sent. This batch has ended. Process a completed batch.
// Remove the request from the peer's active batches
// TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258
let received = batch.download_completed(blocks, *peer_id)?;
let received = batch.download_completed(blocks, batch_peers)?;
let awaiting_batches = batch_id
.saturating_sub(self.optimistic_start.unwrap_or(self.processing_target))
/ EPOCHS_PER_BATCH;
@@ -447,7 +447,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
};
let peer = batch.processing_peer().cloned().ok_or_else(|| {
let batch_peers = batch.processing_peers().ok_or_else(|| {
RemoveChain::WrongBatchState(format!(
"Processing target is in wrong state: {:?}",
batch.state(),
@@ -458,7 +458,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
debug!(
result = ?result,
batch_epoch = %batch_id,
client = %network.client_type(&peer),
batch_state = ?batch_state,
?batch,
"Batch processing result"
@@ -521,10 +520,30 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
BatchProcessResult::FaultyFailure {
imported_blocks,
penalty,
peer_action,
// TODO(sync): propagate error in logs
error: _,
} => {
// Penalize the peer appropriately.
network.report_peer(peer, *penalty, "faulty_batch");
// TODO(sync): De-dup between back and forwards sync
if let Some(penalty) = peer_action.block_peer {
// Penalize the peer appropriately.
network.report_peer(batch_peers.block(), penalty, "faulty_batch");
}
// Penalize each peer only once. Currently a peer_action does not mix different
// PeerAction levels.
for (peer, penalty) in peer_action
.column_peer
.iter()
.filter_map(|(column_index, penalty)| {
batch_peers
.column(column_index)
.map(|peer| (*peer, *penalty))
})
.unique()
{
network.report_peer(peer, penalty, "faulty_batch_column");
}
// Check if this batch is allowed to continue
match batch.processing_completed(BatchProcessingResult::FaultyFailure)? {
@@ -540,6 +559,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.handle_invalid_batch(network, batch_id)
}
BatchOperationOutcome::Failed { blacklist } => {
// TODO(das): what peer action should we apply to the rest of
// peers? Say a batch repeatedly fails because a custody peer is not
// sending us its custody columns
let penalty = PeerAction::LowToleranceError;
// Check that we have not exceeded the re-process retry counter,
// If a batch has exceeded the invalid batch lookup attempts limit, it means
// that it is likely all peers in this chain are sending invalid batches
@@ -554,7 +578,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
);
for peer in self.peers.drain() {
network.report_peer(peer, *penalty, "faulty_chain");
network.report_peer(peer, penalty, "faulty_chain");
}
Err(RemoveChain::ChainFailed {
blacklist,
@@ -633,17 +657,20 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// The validated batch has been re-processed
if attempt.hash != processed_attempt.hash {
// The re-downloaded version was different
if processed_attempt.peer_id != attempt.peer_id {
// TODO(das): should penalize other peers?
let valid_attempt_peer = processed_attempt.block_peer();
let bad_attempt_peer = attempt.block_peer();
if valid_attempt_peer != bad_attempt_peer {
// A different peer sent the correct batch, the previous peer did not
// We negatively score the original peer.
let action = PeerAction::LowToleranceError;
debug!(
batch_epoch = %id, score_adjustment = %action,
original_peer = %attempt.peer_id, new_peer = %processed_attempt.peer_id,
original_peer = %bad_attempt_peer, new_peer = %valid_attempt_peer,
"Re-processed batch validated. Scoring original peer"
);
network.report_peer(
attempt.peer_id,
bad_attempt_peer,
action,
"batch_reprocessed_original_peer",
);
@@ -654,12 +681,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
debug!(
batch_epoch = %id,
score_adjustment = %action,
original_peer = %attempt.peer_id,
new_peer = %processed_attempt.peer_id,
original_peer = %bad_attempt_peer,
new_peer = %valid_attempt_peer,
"Re-processed batch validated by the same peer"
);
network.report_peer(
attempt.peer_id,
bad_attempt_peer,
action,
"batch_reprocessed_same_peer",
);
@@ -888,8 +915,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
) -> ProcessingResult {
let batch_state = self.visualize_batch_state();
if let Some(batch) = self.batches.get_mut(&batch_id) {
let (request, batch_type) = batch.to_blocks_by_range_request();
let failed_peers = batch.failed_peers();
let request = batch.to_blocks_by_range_request();
let failed_peers = batch.failed_block_peers();
// TODO(das): we should request only from peers that are part of this SyncingChain.
// However, then we hit the NoPeer error frequently which causes the batch to fail and
@@ -903,7 +930,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
.collect::<HashSet<_>>();
match network.block_components_by_range_request(
batch_type,
request,
RangeRequestId::RangeSync {
chain_id: self.id,
@@ -999,8 +1025,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
if let Entry::Vacant(entry) = self.batches.entry(epoch) {
let batch_type = network.batch_type(epoch);
let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type);
let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH);
entry.insert(optimistic_batch);
self.send_batch(network, epoch)?;
}
@@ -1101,8 +1126,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.include_next_batch(network)
}
Entry::Vacant(entry) => {
let batch_type = network.batch_type(next_batch_id);
entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH, batch_type));
entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH));
self.to_be_downloaded += EPOCHS_PER_BATCH;
Some(next_batch_id)
}