Reset payload statuses when resuming fork choice (#3498)

## Issue Addressed

NA

## Proposed Changes

This PR is motivated by a recent consensus failure in Geth where it returned `INVALID` for an `VALID` block. Without this PR, the only way to recover is by re-syncing Lighthouse. Whilst ELs "shouldn't have consensus failures", in reality it's something that we can expect from time to time due to the complex nature of Ethereum. Being able to recover easily will help the network recover and EL devs to troubleshoot.

The risk introduced with this PR is that genuinely INVALID payloads get a "second chance" at being imported. I believe the DoS risk here is negligible since LH needs to be restarted in order to re-process the payload. Furthermore, there's no reason to think that a well-performing EL will accept a truly invalid payload the second-time-around.

## Additional Info

This implementation has the following intricacies:

1. Instead of just resetting *invalid* payloads to optimistic, we'll also reset *valid* payloads. This is an artifact of our existing implementation.
1. We will only reset payload statuses when we detect an invalid payload present in `proto_array`
    - This helps save us from forgetting that all our blocks are valid in the "best case scenario" where there are no invalid blocks.
1. If we fail to revert the payload statuses we'll log a `CRIT` and just continue with a `proto_array` that *does not* have reverted payload statuses.
    - The code to revert statuses needs to deal with balances and proposer-boost, so it's a failure point. This is a defensive measure to avoid introducing new show-stopping bugs to LH.
This commit is contained in:
Paul Hauner
2022-08-29 14:34:41 +00:00
parent 2ce86a0830
commit 8609cced0e
13 changed files with 161 additions and 14 deletions

View File

@@ -58,7 +58,7 @@ use execution_layer::{
};
use fork_choice::{
AttestationFromBlock, ExecutionStatus, ForkChoice, ForkchoiceUpdateParameters,
InvalidationOperation, PayloadVerificationStatus,
InvalidationOperation, PayloadVerificationStatus, ResetPayloadStatuses,
};
use futures::channel::mpsc::Sender;
use itertools::process_results;
@@ -432,7 +432,9 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
/// Load fork choice from disk, returning `None` if it isn't found.
pub fn load_fork_choice(
store: BeaconStore<T>,
reset_payload_statuses: ResetPayloadStatuses,
spec: &ChainSpec,
log: &Logger,
) -> Result<Option<BeaconForkChoice<T>>, Error> {
let persisted_fork_choice =
match store.get_item::<PersistedForkChoice>(&FORK_CHOICE_DB_KEY)? {
@@ -445,8 +447,10 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
Ok(Some(ForkChoice::from_persisted(
persisted_fork_choice.fork_choice,
reset_payload_statuses,
fc_store,
spec,
log,
)?))
}
@@ -2925,10 +2929,15 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
// Since the write failed, try to revert the canonical head back to what was stored
// in the database. This attempts to prevent inconsistency between the database and
// fork choice.
if let Err(e) =
self.canonical_head
.restore_from_store(fork_choice, &self.store, &self.spec)
{
if let Err(e) = self.canonical_head.restore_from_store(
fork_choice,
ResetPayloadStatuses::always_reset_conditionally(
self.config.always_reset_payload_statuses,
),
&self.store,
&self.spec,
&self.log,
) {
crit!(
self.log,
"No stored fork choice found to restore from";

View File

@@ -17,7 +17,7 @@ use crate::{
};
use eth1::Config as Eth1Config;
use execution_layer::ExecutionLayer;
use fork_choice::ForkChoice;
use fork_choice::{ForkChoice, ResetPayloadStatuses};
use futures::channel::mpsc::Sender;
use operation_pool::{OperationPool, PersistedOperationPool};
use parking_lot::RwLock;
@@ -245,7 +245,11 @@ where
let fork_choice =
BeaconChain::<Witness<TSlotClock, TEth1Backend, _, _, _>>::load_fork_choice(
store.clone(),
ResetPayloadStatuses::always_reset_conditionally(
self.chain_config.always_reset_payload_statuses,
),
&self.spec,
log,
)
.map_err(|e| format!("Unable to load fork choice from disk: {:?}", e))?
.ok_or("Fork choice not found in store")?;

View File

@@ -43,7 +43,9 @@ use crate::{
BeaconChain, BeaconChainError as Error, BeaconChainTypes, BeaconSnapshot,
};
use eth2::types::{EventKind, SseChainReorg, SseFinalizedCheckpoint, SseHead, SseLateHead};
use fork_choice::{ExecutionStatus, ForkChoiceView, ForkchoiceUpdateParameters, ProtoBlock};
use fork_choice::{
ExecutionStatus, ForkChoiceView, ForkchoiceUpdateParameters, ProtoBlock, ResetPayloadStatuses,
};
use itertools::process_results;
use parking_lot::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard};
use slog::{crit, debug, error, warn, Logger};
@@ -249,11 +251,14 @@ impl<T: BeaconChainTypes> CanonicalHead<T> {
// and it needs to be dropped to prevent a dead-lock. Requiring it to be passed here is
// defensive programming.
mut fork_choice_write_lock: RwLockWriteGuard<BeaconForkChoice<T>>,
reset_payload_statuses: ResetPayloadStatuses,
store: &BeaconStore<T>,
spec: &ChainSpec,
log: &Logger,
) -> Result<(), Error> {
let fork_choice = <BeaconChain<T>>::load_fork_choice(store.clone(), spec)?
.ok_or(Error::MissingPersistedForkChoice)?;
let fork_choice =
<BeaconChain<T>>::load_fork_choice(store.clone(), reset_payload_statuses, spec, log)?
.ok_or(Error::MissingPersistedForkChoice)?;
let fork_choice_view = fork_choice.cached_fork_choice_view();
let beacon_block_root = fork_choice_view.head_block_root;
let beacon_block = store

View File

@@ -34,7 +34,12 @@ pub struct ChainConfig {
pub builder_fallback_epochs_since_finalization: usize,
/// Whether any chain health checks should be considered when deciding whether to use the builder API.
pub builder_fallback_disable_checks: bool,
/// When set to `true`, weigh the "unrealized" FFG progression when choosing a head in fork
/// choice.
pub count_unrealized: bool,
/// When set to `true`, forget any valid/invalid/optimistic statuses in fork choice during start
/// up.
pub always_reset_payload_statuses: bool,
/// Whether to apply paranoid checks to blocks proposed by this beacon node.
pub paranoid_block_proposal: bool,
}
@@ -54,6 +59,7 @@ impl Default for ChainConfig {
builder_fallback_epochs_since_finalization: 3,
builder_fallback_disable_checks: false,
count_unrealized: true,
always_reset_payload_statuses: false,
paranoid_block_proposal: false,
}
}

View File

@@ -786,4 +786,12 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
.takes_value(true)
.default_value("true")
)
.arg(
Arg::with_name("reset-payload-statuses")
.long("reset-payload-statuses")
.help("When present, Lighthouse will forget the payload statuses of any \
already-imported blocks. This can assist in the recovery from a consensus \
failure caused by the execution layer.")
.takes_value(false)
)
}

View File

@@ -644,6 +644,9 @@ pub fn get_config<E: EthSpec>(
client_config.chain.count_unrealized =
clap_utils::parse_required(cli_args, "count-unrealized")?;
client_config.chain.always_reset_payload_statuses =
cli_args.is_present("reset-payload-statuses");
client_config.chain.paranoid_block_proposal = cli_args.is_present("paranoid-block-proposal");
/*