Fix PeerDAS sync scoring (#7352)

* Remove request tracking inside syncing chains

* Prioritize by range peers in network context

* Prioritize custody peers for columns by range

* Explicit error handling of the no peers error case

* Remove good_peers_on_sampling_subnets

* Count AwaitingDownload towards the buffer limit

* Retry syncing chains in AwaitingDownload state

* Use same peer prioritization for lookups

* Review PR

* Address TODOs

* Revert changes to peer erroring in range sync

* Revert metrics changes

* Update comment

* Pass peers_to_deprioritize to select_columns_by_range_peers_to_request

* more idiomatic

* Idiomatic while

* Add note about infinite loop

* Use while let

* Fix wrong custody column count for lookup blocks

* Remove impl

* Remove stale comment

* Fix build errors.

* Or default

* Review PR

* BatchPeerGroup

* Match block and blob signatures

* Explicit match statement to BlockError in range sync

* Remove todo in BatchPeerGroup

* Remove participating peers from backfill sync

* Remove MissingAllCustodyColumns error

* Merge fixes

* Clean up PR

* Consistent naming of batch_peers

* Address multiple review comments

* Better errors for das

* Penalize column peers once

* Restore fn

* Fix error enum

* Removed MismatchedPublicKeyLen

* Revert testing changes

* Change BlockAndCustodyColumns enum variant

* Revert type change in import_historical_block_batch

* Drop pubkey cache

* Don't collect Vec

* Classify errors

* Remove ReconstructColumnsError

* More detailed UnrequestedSlot error

* Lint test

* Fix slot conversion

* Reduce penalty for missing blobs

* Revert changes in peer selection

* Lint tests

* Rename block matching functions

* Reorder block matching in historical blocks

* Fix order of block matching

* Add store tests

* Filter blockchain in assert_correct_historical_block_chain

* Also filter before KZG checks

* Lint tests

* Fix lint

* Fix fulu err assertion

* Check point is not at infinity

* Fix ws sync test

* Revert dropping filter fn

---------

Co-authored-by: Jimmy Chen <jchen.tc@gmail.com>
Co-authored-by: Jimmy Chen <jimmy@sigmaprime.io>
Co-authored-by: Pawan Dhananjay <pawandhananjay@gmail.com>
This commit is contained in:
Lion - dapplion
2025-05-21 08:06:42 -05:00
committed by GitHub
parent f06d1d0346
commit b014675b7a
27 changed files with 1103 additions and 654 deletions

View File

@@ -1,4 +1,4 @@
use super::batch::{BatchInfo, BatchProcessingResult, BatchState};
use super::batch::{BatchInfo, BatchPeers, BatchProcessingResult, BatchState};
use super::RangeSyncType;
use crate::metrics;
use crate::network_beacon_processor::ChainSegmentProcessId;
@@ -6,6 +6,7 @@ use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcRespo
use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult};
use beacon_chain::block_verification_types::RpcBlock;
use beacon_chain::BeaconChainTypes;
use itertools::Itertools;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::{PeerAction, PeerId};
use logging::crit;
@@ -216,7 +217,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
&mut self,
network: &mut SyncNetworkContext<T>,
batch_id: BatchId,
peer_id: &PeerId,
batch_peers: BatchPeers,
request_id: Id,
blocks: Vec<RpcBlock<T::EthSpec>>,
) -> ProcessingResult {
@@ -244,8 +245,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// A stream termination has been sent. This batch has ended. Process a completed batch.
// Remove the request from the peer's active batches
// TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258
let received = batch.download_completed(blocks, *peer_id)?;
let received = batch.download_completed(blocks, batch_peers)?;
let awaiting_batches = batch_id
.saturating_sub(self.optimistic_start.unwrap_or(self.processing_target))
/ EPOCHS_PER_BATCH;
@@ -447,7 +447,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
};
let peer = batch.processing_peer().cloned().ok_or_else(|| {
let batch_peers = batch.processing_peers().ok_or_else(|| {
RemoveChain::WrongBatchState(format!(
"Processing target is in wrong state: {:?}",
batch.state(),
@@ -458,7 +458,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
debug!(
result = ?result,
batch_epoch = %batch_id,
client = %network.client_type(&peer),
batch_state = ?batch_state,
?batch,
"Batch processing result"
@@ -521,10 +520,30 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
BatchProcessResult::FaultyFailure {
imported_blocks,
penalty,
peer_action,
// TODO(sync): propagate error in logs
error: _,
} => {
// Penalize the peer appropriately.
network.report_peer(peer, *penalty, "faulty_batch");
// TODO(sync): De-dup between back and forwards sync
if let Some(penalty) = peer_action.block_peer {
// Penalize the peer appropriately.
network.report_peer(batch_peers.block(), penalty, "faulty_batch");
}
// Penalize each peer only once. Currently a peer_action does not mix different
// PeerAction levels.
for (peer, penalty) in peer_action
.column_peer
.iter()
.filter_map(|(column_index, penalty)| {
batch_peers
.column(column_index)
.map(|peer| (*peer, *penalty))
})
.unique()
{
network.report_peer(peer, penalty, "faulty_batch_column");
}
// Check if this batch is allowed to continue
match batch.processing_completed(BatchProcessingResult::FaultyFailure)? {
@@ -540,6 +559,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.handle_invalid_batch(network, batch_id)
}
BatchOperationOutcome::Failed { blacklist } => {
// TODO(das): what peer action should we apply to the rest of
// peers? Say a batch repeatedly fails because a custody peer is not
// sending us its custody columns
let penalty = PeerAction::LowToleranceError;
// Check that we have not exceeded the re-process retry counter,
// If a batch has exceeded the invalid batch lookup attempts limit, it means
// that it is likely all peers in this chain are sending invalid batches
@@ -554,7 +578,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
);
for peer in self.peers.drain() {
network.report_peer(peer, *penalty, "faulty_chain");
network.report_peer(peer, penalty, "faulty_chain");
}
Err(RemoveChain::ChainFailed {
blacklist,
@@ -633,17 +657,20 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// The validated batch has been re-processed
if attempt.hash != processed_attempt.hash {
// The re-downloaded version was different
if processed_attempt.peer_id != attempt.peer_id {
// TODO(das): should penalize other peers?
let valid_attempt_peer = processed_attempt.block_peer();
let bad_attempt_peer = attempt.block_peer();
if valid_attempt_peer != bad_attempt_peer {
// A different peer sent the correct batch, the previous peer did not
// We negatively score the original peer.
let action = PeerAction::LowToleranceError;
debug!(
batch_epoch = %id, score_adjustment = %action,
original_peer = %attempt.peer_id, new_peer = %processed_attempt.peer_id,
original_peer = %bad_attempt_peer, new_peer = %valid_attempt_peer,
"Re-processed batch validated. Scoring original peer"
);
network.report_peer(
attempt.peer_id,
bad_attempt_peer,
action,
"batch_reprocessed_original_peer",
);
@@ -654,12 +681,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
debug!(
batch_epoch = %id,
score_adjustment = %action,
original_peer = %attempt.peer_id,
new_peer = %processed_attempt.peer_id,
original_peer = %bad_attempt_peer,
new_peer = %valid_attempt_peer,
"Re-processed batch validated by the same peer"
);
network.report_peer(
attempt.peer_id,
bad_attempt_peer,
action,
"batch_reprocessed_same_peer",
);
@@ -888,8 +915,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
) -> ProcessingResult {
let batch_state = self.visualize_batch_state();
if let Some(batch) = self.batches.get_mut(&batch_id) {
let (request, batch_type) = batch.to_blocks_by_range_request();
let failed_peers = batch.failed_peers();
let request = batch.to_blocks_by_range_request();
let failed_peers = batch.failed_block_peers();
// TODO(das): we should request only from peers that are part of this SyncingChain.
// However, then we hit the NoPeer error frequently which causes the batch to fail and
@@ -903,7 +930,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
.collect::<HashSet<_>>();
match network.block_components_by_range_request(
batch_type,
request,
RangeRequestId::RangeSync {
chain_id: self.id,
@@ -999,8 +1025,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
if let Entry::Vacant(entry) = self.batches.entry(epoch) {
let batch_type = network.batch_type(epoch);
let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type);
let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH);
entry.insert(optimistic_batch);
self.send_batch(network, epoch)?;
}
@@ -1101,8 +1126,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.include_next_batch(network)
}
Entry::Vacant(entry) => {
let batch_type = network.batch_type(next_batch_id);
entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH, batch_type));
entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH));
self.to_be_downloaded += EPOCHS_PER_BATCH;
Some(next_batch_id)
}