mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-06 18:21:45 +00:00
Handle processing results of non faulty batches (#3439)
## Issue Addressed Solves #3390 So after checking some logs @pawanjay176 got, we conclude that this happened because we blacklisted a chain after trying it "too much". Now here, in all occurrences it seems that "too much" means we got too many download failures. This happened very slowly, exactly because the batch is allowed to stay alive for very long times after not counting penalties when the ee is offline. The error here then was not that the batch failed because of offline ee errors, but that we blacklisted a chain because of download errors, which we can't pin on the chain but on the peer. This PR fixes that. ## Proposed Changes Adds a missing piece of logic so that if a chain fails for errors that can't be attributed to an objectively bad behavior from the peer, it is not blacklisted. The issue at hand occurred when new peers arrived claiming a head that had wrongfully blacklisted, even if the original peers participating in the chain were not penalized. Another notable change is that we need to consider a batch invalid if it processed correctly but its next non empty batch fails processing. Now since a batch can fail processing in non empty ways, there is no need to mark as invalid previous batches. Improves some logging as well. ## Additional Info We should do this regardless of pausing sync on ee offline/unsynced state. This is because I think it's almost impossible to ensure a processing result will reach in a predictable order with a synced notification from the ee. Doing this handles what I think are inevitable data races when we actually pause sync This also fixes a return that reports which batch failed and caused us some confusion checking the logs
This commit is contained in:
@@ -70,12 +70,16 @@ impl BatchConfig for RangeSyncBatchConfig {
|
||||
// Such errors should never be encountered.
|
||||
pub struct WrongState(pub(crate) String);
|
||||
|
||||
/// Auxiliary type alias for readability.
|
||||
type IsFailed = bool;
|
||||
/// After batch operations, we use this to communicate whether a batch can continue or not
|
||||
pub enum BatchOperationOutcome {
|
||||
Continue,
|
||||
Failed { blacklist: bool },
|
||||
}
|
||||
|
||||
pub enum BatchProcessingResult {
|
||||
Success,
|
||||
Failed { count_attempt: bool },
|
||||
FaultyFailure,
|
||||
NonFaultyFailure,
|
||||
}
|
||||
|
||||
/// A segment of a chain.
|
||||
@@ -87,7 +91,7 @@ pub struct BatchInfo<T: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
|
||||
/// The `Attempts` that have been made and failed to send us this batch.
|
||||
failed_processing_attempts: Vec<Attempt>,
|
||||
/// Number of processing attempts that have failed but we do not count.
|
||||
other_failed_processing_attempts: u8,
|
||||
non_faulty_processing_attempts: u8,
|
||||
/// The number of download retries this batch has undergone due to a failed request.
|
||||
failed_download_attempts: Vec<PeerId>,
|
||||
/// State of the batch.
|
||||
@@ -124,14 +128,6 @@ impl<T: EthSpec> BatchState<T> {
|
||||
pub fn poison(&mut self) -> BatchState<T> {
|
||||
std::mem::replace(self, BatchState::Poisoned)
|
||||
}
|
||||
|
||||
pub fn is_failed(&self) -> IsFailed {
|
||||
match self {
|
||||
BatchState::Failed => true,
|
||||
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
@@ -151,7 +147,7 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
end_slot,
|
||||
failed_processing_attempts: Vec::new(),
|
||||
failed_download_attempts: Vec::new(),
|
||||
other_failed_processing_attempts: 0,
|
||||
non_faulty_processing_attempts: 0,
|
||||
state: BatchState::AwaitingDownload,
|
||||
marker: std::marker::PhantomData,
|
||||
}
|
||||
@@ -175,7 +171,16 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
peers
|
||||
}
|
||||
|
||||
/// Verifies if an incomming block belongs to this batch.
|
||||
/// Return the number of times this batch has failed downloading and failed processing, in this
|
||||
/// order.
|
||||
pub fn failed_attempts(&self) -> (usize, usize) {
|
||||
(
|
||||
self.failed_download_attempts.len(),
|
||||
self.failed_processing_attempts.len(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Verifies if an incoming block belongs to this batch.
|
||||
pub fn is_expecting_block(&self, peer_id: &PeerId, request_id: &Id) -> bool {
|
||||
if let BatchState::Downloading(expected_peer, _, expected_id) = &self.state {
|
||||
return peer_id == expected_peer && expected_id == request_id;
|
||||
@@ -203,6 +208,20 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
}
|
||||
}
|
||||
|
||||
/// After different operations over a batch, this could be in a state that allows it to
|
||||
/// continue, or in failed state. When the batch has failed, we check if it did mainly due to
|
||||
/// processing failures. In this case the batch is considered failed and faulty.
|
||||
pub fn outcome(&self) -> BatchOperationOutcome {
|
||||
match self.state {
|
||||
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
||||
BatchState::Failed => BatchOperationOutcome::Failed {
|
||||
blacklist: self.failed_processing_attempts.len()
|
||||
> self.failed_download_attempts.len(),
|
||||
},
|
||||
_ => BatchOperationOutcome::Continue,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn state(&self) -> &BatchState<T> {
|
||||
&self.state
|
||||
}
|
||||
@@ -235,7 +254,10 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
#[must_use = "Batch may have failed"]
|
||||
pub fn download_completed(
|
||||
&mut self,
|
||||
) -> Result<usize /* Received blocks */, Result<(Slot, Slot, IsFailed), WrongState>> {
|
||||
) -> Result<
|
||||
usize, /* Received blocks */
|
||||
Result<(Slot, Slot, BatchOperationOutcome), WrongState>,
|
||||
> {
|
||||
match self.state.poison() {
|
||||
BatchState::Downloading(peer, blocks, _request_id) => {
|
||||
// verify that blocks are in range
|
||||
@@ -264,7 +286,7 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
BatchState::AwaitingDownload
|
||||
};
|
||||
|
||||
return Err(Ok((expected, received, self.state.is_failed())));
|
||||
return Err(Ok((expected, received, self.outcome())));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -289,7 +311,10 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
/// THe `mark_failed` parameter, when set to false, does not increment the failed attempts of
|
||||
/// this batch and register the peer, rather attempts a re-download.
|
||||
#[must_use = "Batch may have failed"]
|
||||
pub fn download_failed(&mut self, mark_failed: bool) -> Result<IsFailed, WrongState> {
|
||||
pub fn download_failed(
|
||||
&mut self,
|
||||
mark_failed: bool,
|
||||
) -> Result<BatchOperationOutcome, WrongState> {
|
||||
match self.state.poison() {
|
||||
BatchState::Downloading(peer, _, _request_id) => {
|
||||
// register the attempt and check if the batch can be tried again
|
||||
@@ -304,7 +329,7 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
// drop the blocks
|
||||
BatchState::AwaitingDownload
|
||||
};
|
||||
Ok(self.state.is_failed())
|
||||
Ok(self.outcome())
|
||||
}
|
||||
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
||||
other => {
|
||||
@@ -359,32 +384,31 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
pub fn processing_completed(
|
||||
&mut self,
|
||||
procesing_result: BatchProcessingResult,
|
||||
) -> Result<IsFailed, WrongState> {
|
||||
) -> Result<BatchOperationOutcome, WrongState> {
|
||||
match self.state.poison() {
|
||||
BatchState::Processing(attempt) => {
|
||||
self.state = match procesing_result {
|
||||
BatchProcessingResult::Success => BatchState::AwaitingValidation(attempt),
|
||||
BatchProcessingResult::Failed { count_attempt } => {
|
||||
if count_attempt {
|
||||
// register the failed attempt
|
||||
self.failed_processing_attempts.push(attempt);
|
||||
BatchProcessingResult::FaultyFailure => {
|
||||
// register the failed attempt
|
||||
self.failed_processing_attempts.push(attempt);
|
||||
|
||||
// check if the batch can be downloaded again
|
||||
if self.failed_processing_attempts.len()
|
||||
>= B::max_batch_processing_attempts() as usize
|
||||
{
|
||||
BatchState::Failed
|
||||
} else {
|
||||
BatchState::AwaitingDownload
|
||||
}
|
||||
// check if the batch can be downloaded again
|
||||
if self.failed_processing_attempts.len()
|
||||
>= B::max_batch_processing_attempts() as usize
|
||||
{
|
||||
BatchState::Failed
|
||||
} else {
|
||||
self.other_failed_processing_attempts =
|
||||
self.other_failed_processing_attempts.saturating_add(1);
|
||||
BatchState::AwaitingDownload
|
||||
}
|
||||
}
|
||||
BatchProcessingResult::NonFaultyFailure => {
|
||||
self.non_faulty_processing_attempts =
|
||||
self.non_faulty_processing_attempts.saturating_add(1);
|
||||
BatchState::AwaitingDownload
|
||||
}
|
||||
};
|
||||
Ok(self.state.is_failed())
|
||||
Ok(self.outcome())
|
||||
}
|
||||
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
||||
other => {
|
||||
@@ -398,7 +422,7 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
}
|
||||
|
||||
#[must_use = "Batch may have failed"]
|
||||
pub fn validation_failed(&mut self) -> Result<IsFailed, WrongState> {
|
||||
pub fn validation_failed(&mut self) -> Result<BatchOperationOutcome, WrongState> {
|
||||
match self.state.poison() {
|
||||
BatchState::AwaitingValidation(attempt) => {
|
||||
self.failed_processing_attempts.push(attempt);
|
||||
@@ -411,7 +435,7 @@ impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
|
||||
} else {
|
||||
BatchState::AwaitingDownload
|
||||
};
|
||||
Ok(self.state.is_failed())
|
||||
Ok(self.outcome())
|
||||
}
|
||||
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
||||
other => {
|
||||
@@ -472,10 +496,7 @@ impl<T: EthSpec, B: BatchConfig> slog::KV for BatchInfo<T, B> {
|
||||
)?;
|
||||
serializer.emit_usize("downloaded", self.failed_download_attempts.len())?;
|
||||
serializer.emit_usize("processed", self.failed_processing_attempts.len())?;
|
||||
serializer.emit_u8(
|
||||
"processed_no_penalty",
|
||||
self.other_failed_processing_attempts,
|
||||
)?;
|
||||
serializer.emit_u8("processed_no_penalty", self.non_faulty_processing_attempts)?;
|
||||
serializer.emit_arguments("state", &format_args!("{:?}", self.state))?;
|
||||
slog::Result::Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user