Avoid peer penalties on internal errors for batch block import (#2898)

## Issue Addressed

NA

## Proposed Changes

I've observed some Prater nodes (and potentially some mainnet nodes) banning peers due to validator pubkey cache lock timeouts. For the `BeaconChainError`-type of errors, they're caused by internal faults and we can't necessarily tell if the peer is bad or not. I think this is causing us to ban peers unnecessarily when running on under-resourced machines.

## Additional Info

NA
This commit is contained in:
Paul Hauner
2022-01-11 05:33:28 +00:00
parent 6976796162
commit 4848e53155
4 changed files with 156 additions and 46 deletions

View File

@@ -7,7 +7,7 @@ use crate::sync::{BatchProcessResult, ChainId};
use beacon_chain::{
BeaconChainError, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError,
};
use lighthouse_network::PeerId;
use lighthouse_network::{PeerAction, PeerId};
use slog::{crit, debug, error, info, trace, warn};
use tokio::sync::mpsc;
use types::{Epoch, Hash256, SignedBeaconBlock};
@@ -23,6 +23,14 @@ pub enum ProcessId {
ParentLookup(PeerId, Hash256),
}
/// Returned when a chain segment import fails.
struct ChainSegmentFailed {
/// To be displayed in logs.
message: String,
/// Used to penalize peers.
peer_action: Option<PeerAction>,
}
impl<T: BeaconChainTypes> Worker<T> {
/// Attempt to process a block received from a direct RPC request, returning the processing
/// result on the `result_tx` channel.
@@ -123,9 +131,13 @@ impl<T: BeaconChainTypes> Worker<T> {
"chain" => chain_id,
"last_block_slot" => end_slot,
"imported_blocks" => imported_blocks,
"error" => e,
"error" => %e.message,
"service" => "sync");
BatchProcessResult::Failed(imported_blocks > 0)
BatchProcessResult::Failed {
imported_blocks: imported_blocks > 0,
peer_action: e.peer_action,
}
}
};
@@ -154,9 +166,12 @@ impl<T: BeaconChainTypes> Worker<T> {
"batch_epoch" => epoch,
"first_block_slot" => start_slot,
"last_block_slot" => end_slot,
"error" => e,
"error" => %e.message,
"service" => "sync");
BatchProcessResult::Failed(false)
BatchProcessResult::Failed {
imported_blocks: false,
peer_action: e.peer_action,
}
}
};
@@ -175,7 +190,7 @@ impl<T: BeaconChainTypes> Worker<T> {
// reverse
match self.process_blocks(downloaded_blocks.iter().rev()) {
(_, Err(e)) => {
debug!(self.log, "Parent lookup failed"; "last_peer_id" => %peer_id, "error" => e);
debug!(self.log, "Parent lookup failed"; "last_peer_id" => %peer_id, "error" => %e.message);
self.send_sync_message(SyncMessage::ParentLookupFailed {
peer_id,
chain_head,
@@ -193,7 +208,7 @@ impl<T: BeaconChainTypes> Worker<T> {
fn process_blocks<'a>(
&self,
downloaded_blocks: impl Iterator<Item = &'a SignedBeaconBlock<T::EthSpec>>,
) -> (usize, Result<(), String>) {
) -> (usize, Result<(), ChainSegmentFailed>) {
let blocks = downloaded_blocks.cloned().collect::<Vec<_>>();
match self.chain.process_chain_segment(blocks) {
ChainSegmentResult::Successful { imported_blocks } => {
@@ -223,7 +238,7 @@ impl<T: BeaconChainTypes> Worker<T> {
fn process_backfill_blocks(
&self,
blocks: &[SignedBeaconBlock<T::EthSpec>],
) -> (usize, Result<(), String>) {
) -> (usize, Result<(), ChainSegmentFailed>) {
match self.chain.import_historical_block_batch(blocks) {
Ok(imported_blocks) => {
metrics::inc_counter(
@@ -250,7 +265,12 @@ impl<T: BeaconChainTypes> Worker<T> {
"block_root" => ?block_root,
"expected_root" => ?expected_block_root
);
String::from("mismatched_block_root")
ChainSegmentFailed {
message: String::from("mismatched_block_root"),
// The peer is faulty if they send blocks with bad roots.
peer_action: Some(PeerAction::LowToleranceError),
}
}
HistoricalBlockError::InvalidSignature
| HistoricalBlockError::SignatureSet(_) => {
@@ -259,7 +279,12 @@ impl<T: BeaconChainTypes> Worker<T> {
"Backfill batch processing error";
"error" => ?e
);
"invalid_signature".into()
ChainSegmentFailed {
message: "invalid_signature".into(),
// The peer is faulty if they bad signatures.
peer_action: Some(PeerAction::LowToleranceError),
}
}
HistoricalBlockError::ValidatorPubkeyCacheTimeout => {
warn!(
@@ -267,25 +292,55 @@ impl<T: BeaconChainTypes> Worker<T> {
"Backfill batch processing error";
"error" => "pubkey_cache_timeout"
);
"pubkey_cache_timeout".into()
ChainSegmentFailed {
message: "pubkey_cache_timeout".into(),
// This is an internal error, do not penalize the peer.
peer_action: None,
}
}
HistoricalBlockError::NoAnchorInfo => {
warn!(self.log, "Backfill not required");
String::from("no_anchor_info")
ChainSegmentFailed {
message: String::from("no_anchor_info"),
// There is no need to do a historical sync, this is not a fault of
// the peer.
peer_action: None,
}
}
HistoricalBlockError::IndexOutOfBounds
| HistoricalBlockError::BlockOutOfRange { .. } => {
HistoricalBlockError::IndexOutOfBounds => {
error!(
self.log,
"Backfill batch processing error";
"Backfill batch OOB error";
"error" => ?e,
);
String::from("logic_error")
ChainSegmentFailed {
message: String::from("logic_error"),
// This should never occur, don't penalize the peer.
peer_action: None,
}
}
HistoricalBlockError::BlockOutOfRange { .. } => {
error!(
self.log,
"Backfill batch error";
"error" => ?e,
);
ChainSegmentFailed {
message: String::from("unexpected_error"),
// This should never occur, don't penalize the peer.
peer_action: None,
}
}
},
other => {
warn!(self.log, "Backfill batch processing error"; "error" => ?other);
format!("{:?}", other)
ChainSegmentFailed {
message: format!("{:?}", other),
// This is an internal error, don't penalize the peer.
peer_action: None,
}
}
};
(0, Err(err))
@@ -312,15 +367,18 @@ impl<T: BeaconChainTypes> Worker<T> {
}
/// Helper function to handle a `BlockError` from `process_chain_segment`
fn handle_failed_chain_segment(&self, error: BlockError<T::EthSpec>) -> Result<(), String> {
fn handle_failed_chain_segment(
&self,
error: BlockError<T::EthSpec>,
) -> Result<(), ChainSegmentFailed> {
match error {
BlockError::ParentUnknown(block) => {
// blocks should be sequential and all parents should exist
Err(format!(
"Block has an unknown parent: {}",
block.parent_root()
))
Err(ChainSegmentFailed {
message: format!("Block has an unknown parent: {}", block.parent_root()),
// Peers are faulty if they send non-sequential blocks.
peer_action: Some(PeerAction::LowToleranceError),
})
}
BlockError::BlockIsAlreadyKnown => {
// This can happen for many reasons. Head sync's can download multiples and parent
@@ -350,10 +408,14 @@ impl<T: BeaconChainTypes> Worker<T> {
);
}
Err(format!(
"Block with slot {} is higher than the current slot {}",
block_slot, present_slot
))
Err(ChainSegmentFailed {
message: format!(
"Block with slot {} is higher than the current slot {}",
block_slot, present_slot
),
// Peers are faulty if they send blocks from the future.
peer_action: Some(PeerAction::LowToleranceError),
})
}
BlockError::WouldRevertFinalizedSlot { .. } => {
debug!(self.log, "Finalized or earlier block processed";);
@@ -370,7 +432,11 @@ impl<T: BeaconChainTypes> Worker<T> {
"outcome" => ?e,
);
Err(format!("Internal error whilst processing block: {:?}", e))
Err(ChainSegmentFailed {
message: format!("Internal error whilst processing block: {:?}", e),
// Do not penalize peers for internal errors.
peer_action: None,
})
}
other => {
debug!(
@@ -379,7 +445,11 @@ impl<T: BeaconChainTypes> Worker<T> {
"outcome" => %other,
);
Err(format!("Peer sent invalid block. Reason: {:?}", other))
Err(ChainSegmentFailed {
message: format!("Peer sent invalid block. Reason: {:?}", other),
// Do not penalize peers for internal errors.
peer_action: None,
})
}
}
}