Avoid peer penalties on internal errors for batch block import (#2898)

## Issue Addressed

NA

## Proposed Changes

I've observed some Prater nodes (and potentially some mainnet nodes) banning peers due to validator pubkey cache lock timeouts. `BeaconChainError`-type errors are caused by internal faults, so we can't necessarily tell whether the peer is at fault. I think this is causing us to ban peers unnecessarily when running on under-resourced machines.

## Additional Info

NA
This commit is contained in:
Paul Hauner
2022-01-11 05:33:28 +00:00
parent 6976796162
commit 4848e53155
4 changed files with 156 additions and 46 deletions

View File

@@ -541,7 +541,15 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is
// re-downloaded.
self.on_batch_process_result(network, batch_id, &BatchProcessResult::Failed(false))
self.on_batch_process_result(
network,
batch_id,
&BatchProcessResult::Failed {
imported_blocks: false,
// The beacon processor queue is full, no need to penalize the peer.
peer_action: None,
},
)
} else {
Ok(ProcessResult::Successful)
}
@@ -621,7 +629,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
self.process_completed_batches(network)
}
}
BatchProcessResult::Failed(imported_blocks) => {
BatchProcessResult::Failed {
imported_blocks,
peer_action,
} => {
let batch = match self.batches.get_mut(&batch_id) {
Some(v) => v,
None => {
@@ -659,12 +670,20 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// that it is likely all peers are sending invalid batches
// repeatedly and are either malicious or faulty. We stop the backfill sync and
// report all synced peers that have participated.
let action = PeerAction::LowToleranceError;
warn!(self.log, "Backfill batch failed to download. Penalizing peers";
"score_adjustment" => %action,
"batch_epoch"=> batch_id);
for peer in self.participating_peers.drain() {
network.report_peer(peer, action);
warn!(
self.log,
"Backfill batch failed to download. Penalizing peers";
"score_adjustment" => %peer_action
.as_ref()
.map(ToString::to_string)
.unwrap_or_else(|| "None".into()),
"batch_epoch"=> batch_id
);
if let Some(peer_action) = peer_action {
for peer in self.participating_peers.drain() {
network.report_peer(peer, *peer_action);
}
}
self.fail_sync(BackFillError::BatchProcessingFailed(batch_id))
.map(|_| ProcessResult::Successful)

View File

@@ -137,7 +137,10 @@ pub enum BatchProcessResult {
/// The batch was completed successfully. It carries whether the sent batch contained blocks.
Success(bool),
/// The batch processing failed. It carries whether the processing imported any block.
Failed(bool),
Failed {
imported_blocks: bool,
peer_action: Option<PeerAction>,
},
}
/// Maintains a sequential list of parents to lookup and the lookup's current state.

View File

@@ -313,7 +313,14 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is
// re-downloaded.
self.on_batch_process_result(network, batch_id, &BatchProcessResult::Failed(false))
self.on_batch_process_result(
network,
batch_id,
&BatchProcessResult::Failed {
imported_blocks: false,
peer_action: None,
},
)
} else {
Ok(KeepChain)
}
@@ -488,7 +495,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.process_completed_batches(network)
}
}
BatchProcessResult::Failed(imported_blocks) => {
BatchProcessResult::Failed {
imported_blocks,
peer_action,
} => {
let batch = self.batches.get_mut(&batch_id).ok_or_else(|| {
RemoveChain::WrongChainState(format!(
"Batch not found for current processing target {}",
@@ -511,12 +521,20 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// report all peers.
// There are some edge cases with forks that could land us in this situation.
// This should be unlikely, so we tolerate these errors, but not often.
let action = PeerAction::LowToleranceError;
warn!(self.log, "Batch failed to download. Dropping chain scoring peers";
"score_adjustment" => %action,
"batch_epoch"=> batch_id);
for (peer, _) in self.peers.drain() {
network.report_peer(peer, action);
warn!(
self.log,
"Batch failed to download. Dropping chain scoring peers";
"score_adjustment" => %peer_action
.as_ref()
.map(ToString::to_string)
.unwrap_or_else(|| "None".into()),
"batch_epoch"=> batch_id
);
if let Some(peer_action) = peer_action {
for (peer, _) in self.peers.drain() {
network.report_peer(peer, *peer_action);
}
}
Err(RemoveChain::ChainFailed(batch_id))
} else {