Handle processing results of non faulty batches (#3439)

## Issue Addressed
Solves #3390 

So after checking some logs @pawanjay176 got, we conclude that this happened because we blacklisted a chain after trying it "too much". In all occurrences, "too much" means we accumulated too many download failures. This built up very slowly, precisely because a batch is allowed to stay alive for a very long time once we stop counting penalties while the ee is offline. So the real problem was not that the batch failed due to offline-ee errors, but that we blacklisted a chain because of download errors — errors we can't pin on the chain itself, only on individual peers. This PR fixes that.

## Proposed Changes

Adds a missing piece of logic so that if a chain fails for errors that can't be attributed to objectively bad behavior from a peer, it is not blacklisted. The issue at hand occurred when new peers arrived claiming a head on a chain that had been wrongfully blacklisted, even though the original peers participating in that chain were never penalized.

Another notable change is that we only need to consider a batch invalid if it processed correctly but its next non-empty batch fails processing. Since a batch can now fail processing in non-faulty ways, there is no need to mark previous batches as invalid in that case.

Improves some logging as well.

## Additional Info

We should do this regardless of whether we pause sync on an ee offline/unsynced state. This is because I think it's almost impossible to guarantee that a processing result will arrive in a predictable order relative to a synced notification from the ee. Handling it this way covers what I believe are inevitable data races once we actually pause sync.

This also fixes a return that reports which batch failed and caused us some confusion checking the logs
This commit is contained in:
Divma
2022-08-12 00:56:38 +00:00
parent a476ae4907
commit f4ffa9e0b4
12 changed files with 298 additions and 274 deletions

View File

@@ -1,7 +1,8 @@
use super::batch::{BatchInfo, BatchProcessingResult, BatchState};
use crate::beacon_processor::WorkEvent as BeaconWorkEvent;
use crate::beacon_processor::{ChainSegmentProcessId, FailureMode};
use crate::sync::{manager::Id, network_context::SyncNetworkContext, BatchProcessResult};
use crate::beacon_processor::{ChainSegmentProcessId, WorkEvent as BeaconWorkEvent};
use crate::sync::{
manager::Id, network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult,
};
use beacon_chain::{BeaconChainTypes, CountUnrealized};
use fnv::FnvHashMap;
use lighthouse_network::{PeerAction, PeerId};
@@ -37,7 +38,11 @@ pub type ProcessingResult = Result<KeepChain, RemoveChain>;
pub enum RemoveChain {
EmptyPeerPool,
ChainCompleted,
ChainFailed(BatchId),
/// A chain has failed. This boolean signals whether the chain should be blacklisted.
ChainFailed {
blacklist: bool,
failing_batch: BatchId,
},
WrongBatchState(String),
WrongChainState(String),
}
@@ -187,8 +192,13 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// fail the batches
for id in batch_ids {
if let Some(batch) = self.batches.get_mut(&id) {
if batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(id));
if let BatchOperationOutcome::Failed { blacklist } =
batch.download_failed(true)?
{
return Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: id,
});
}
self.retry_batch_download(network, id)?;
} else {
@@ -265,12 +275,15 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.process_completed_batches(network)
}
Err(result) => {
let (expected_boundary, received_boundary, is_failed) = result?;
let (expected_boundary, received_boundary, outcome) = result?;
warn!(self.log, "Batch received out of range blocks"; "expected_boundary" => expected_boundary, "received_boundary" => received_boundary,
"peer_id" => %peer_id, batch);
if is_failed {
return Err(RemoveChain::ChainFailed(batch_id));
if let BatchOperationOutcome::Failed { blacklist } = outcome {
return Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: batch_id,
});
}
// this batch can't be used, so we need to request it again.
self.retry_batch_download(network, batch_id)
@@ -324,15 +337,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is
// re-downloaded.
self.on_batch_process_result(
network,
batch_id,
&BatchProcessResult::Failed {
imported_blocks: false,
peer_action: None,
mode: FailureMode::ConsensusLayer,
},
)
self.on_batch_process_result(network, batch_id, &BatchProcessResult::NonFaultyFailure)
} else {
Ok(KeepChain)
}
@@ -448,7 +453,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
) -> ProcessingResult {
// the first two cases are possible if the chain advances while waiting for a processing
// result
match &self.current_processing_batch {
let batch = match &self.current_processing_batch {
Some(processing_id) if *processing_id != batch_id => {
debug!(self.log, "Unexpected batch result";
"batch_epoch" => batch_id, "expected_batch_epoch" => processing_id);
@@ -462,22 +467,35 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
_ => {
// batch_id matches, continue
self.current_processing_batch = None;
}
}
match result {
BatchProcessResult::Success(was_non_empty) => {
let batch = self.batches.get_mut(&batch_id).ok_or_else(|| {
self.batches.get_mut(&batch_id).ok_or_else(|| {
RemoveChain::WrongChainState(format!(
"Current processing batch not found: {}",
batch_id
))
})?;
})?
}
};
let peer = batch.current_peer().cloned().ok_or_else(|| {
RemoveChain::WrongBatchState(format!(
"Processing target is in wrong state: {:?}",
batch.state(),
))
})?;
// Log the process result and the batch for debugging purposes.
debug!(self.log, "Batch processing result"; "result" => ?result, &batch,
"batch_epoch" => batch_id, "client" => %network.client_type(&peer));
// We consider three cases. Batch was successfully processed, Batch failed processing due
// to a faulty peer, or batch failed processing but the peer can't be deemed faulty.
match result {
BatchProcessResult::Success { was_non_empty } => {
batch.processing_completed(BatchProcessingResult::Success)?;
// If the processed batch was not empty, we can validate previous unvalidated
// blocks.
if *was_non_empty {
// If the processed batch was not empty, we can validate previous unvalidated
// blocks.
self.advance_chain(network, batch_id);
// we register so that on chain switching we don't try it again
self.attempted_optimistic_starts.insert(batch_id);
@@ -507,64 +525,56 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.process_completed_batches(network)
}
}
BatchProcessResult::Failed {
BatchProcessResult::FaultyFailure {
imported_blocks,
peer_action,
mode: _,
penalty,
} => {
let batch = self.batches.get_mut(&batch_id).ok_or_else(|| {
RemoveChain::WrongChainState(format!(
"Batch not found for current processing target {}",
batch_id
))
})?;
let peer = batch.current_peer().cloned().ok_or_else(|| {
RemoveChain::WrongBatchState(format!(
"Processing target is in wrong state: {:?}",
batch.state(),
))
})?;
debug!(self.log, "Batch processing failed"; "imported_blocks" => imported_blocks, "peer_penalty" => ?peer_action,
"batch_epoch" => batch_id, "peer" => %peer, "client" => %network.client_type(&peer));
// Penalize the peer appropriately.
network.report_peer(peer, *penalty, "faulty_batch");
if batch.processing_completed(BatchProcessingResult::Failed {
count_attempt: peer_action.is_some(),
})? {
// check that we have not exceeded the re-process retry counter
// If a batch has exceeded the invalid batch lookup attempts limit, it means
// that it is likely all peers in this chain are are sending invalid batches
// repeatedly and are either malicious or faulty. We drop the chain and
// report all peers.
// There are some edge cases with forks that could land us in this situation.
// This should be unlikely, so we tolerate these errors, but not often.
warn!(
self.log,
"Batch failed to download. Dropping chain scoring peers";
"score_adjustment" => %peer_action
.as_ref()
.map(ToString::to_string)
.unwrap_or_else(|| "None".into()),
"batch_epoch"=> batch_id
);
if let Some(peer_action) = peer_action {
for (peer, _) in self.peers.drain() {
network.report_peer(peer, *peer_action, "batch_failed");
// Check if this batch is allowed to continue
match batch.processing_completed(BatchProcessingResult::FaultyFailure)? {
BatchOperationOutcome::Continue => {
// Chain can continue. Check if it can be moved forward.
if *imported_blocks {
// At least one block was successfully verified and imported, so we can be sure all
// previous batches are valid and we only need to download the current failed
// batch.
self.advance_chain(network, batch_id);
}
// Handle this invalid batch, that is within the re-process retries limit.
self.handle_invalid_batch(network, batch_id)
}
Err(RemoveChain::ChainFailed(batch_id))
} else {
// chain can continue. Check if it can be moved forward
if *imported_blocks {
// At least one block was successfully verified and imported, so we can be sure all
// previous batches are valid and we only need to download the current failed
// batch.
self.advance_chain(network, batch_id);
BatchOperationOutcome::Failed { blacklist } => {
// Check that we have not exceeded the re-process retry counter,
// If a batch has exceeded the invalid batch lookup attempts limit, it means
// that it is likely all peers in this chain are sending invalid batches
// repeatedly and are either malicious or faulty. We drop the chain and
// report all peers.
// There are some edge cases with forks that could land us in this situation.
// This should be unlikely, so we tolerate these errors, but not often.
warn!(
self.log,
"Batch failed to download. Dropping chain scoring peers";
"score_adjustment" => %penalty,
"batch_epoch"=> batch_id,
);
for (peer, _) in self.peers.drain() {
network.report_peer(peer, *penalty, "faulty_chain");
}
Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: batch_id,
})
}
// Handle this invalid batch, that is within the re-process retries limit.
self.handle_invalid_batch(network, batch_id)
}
}
BatchProcessResult::NonFaultyFailure => {
batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?;
// Simply redownload the batch.
self.retry_batch_download(network, batch_id)
}
}
}
@@ -737,9 +747,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
let mut redownload_queue = Vec::new();
for (id, batch) in self.batches.range_mut(..batch_id) {
if batch.validation_failed()? {
if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? {
// remove the chain early
return Err(RemoveChain::ChainFailed(batch_id));
return Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: *id,
});
}
redownload_queue.push(*id);
}
@@ -836,8 +849,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
if let Some(active_requests) = self.peers.get_mut(peer_id) {
active_requests.remove(&batch_id);
}
if batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(batch_id));
if let BatchOperationOutcome::Failed { blacklist } = batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: batch_id,
});
}
self.retry_batch_download(network, batch_id)
} else {
@@ -925,10 +941,16 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.peers
.get_mut(&peer)
.map(|request| request.remove(&batch_id));
if batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(batch_id));
} else {
return self.retry_batch_download(network, batch_id);
match batch.download_failed(true)? {
BatchOperationOutcome::Failed { blacklist } => {
return Err(RemoveChain::ChainFailed {
blacklist,
failing_batch: batch_id,
})
}
BatchOperationOutcome::Continue => {
return self.retry_batch_download(network, batch_id)
}
}
}
}