Make re-org strat more cautious and add more config (#4151)

## Proposed Changes

This change attempts to prevent failed re-orgs by:

1. Lowering the re-org cutoff from 2s to 1s. This is informed by a failed re-org attempted by @yorickdowne's node. The failed block was requested in the 1.5-2s window due to a Vouch failure, and failed to propagate to the majority of the network before the attestation deadline at 4s.
2. Allowing users to adjust their re-org cutoff depending on observed network conditions and their risk profile. The static 2-second cutoff was too rigid.
3. Adding a `--proposer-reorg-disallowed-offsets` flag which can be used to prohibit re-orgs at certain slots. This is intended to help work around an issue whereby re-orging blocks at slot 1 are currently taking ~1.6s to propagate on gossip rather than ~500ms. This is suspected to be due to a cache miss in current versions of Prysm, which should be fixed in their next release.

## Additional Info

I'm of two minds about removing the `shuffling_stable` check, which checks for blocks at slot 0 in the epoch. If we removed it, users would be able to configure Lighthouse to try re-orging at slot 0, which likely wouldn't work very well due to interactions with the proposer index cache. I think we could leave it for now and revisit it later.
This commit is contained in:
Michael Sproul
2023-04-13 07:05:01 +00:00
parent 00cf5fc184
commit b90c0c3fb1
12 changed files with 218 additions and 18 deletions

View File

@@ -106,7 +106,6 @@ use task_executor::{ShutdownReason, TaskExecutor};
use tokio_stream::Stream;
use tree_hash::TreeHash;
use types::beacon_state::CloneConfig;
use types::consts::merge::INTERVALS_PER_SLOT;
use types::*;
pub type ForkChoiceError = fork_choice::Error<crate::ForkChoiceStoreError>;
@@ -128,12 +127,6 @@ pub const VALIDATOR_PUBKEY_CACHE_LOCK_TIMEOUT: Duration = Duration::from_secs(1)
/// The timeout for the eth1 finalization cache
pub const ETH1_FINALIZATION_CACHE_LOCK_TIMEOUT: Duration = Duration::from_millis(200);
/// Upper bound on how far into a slot a 1-slot re-org proposal may still be attempted.
///
/// Takes the slot length in seconds and returns half of one slot interval
/// (`seconds_per_slot / INTERVALS_PER_SLOT / 2`) as a `Duration`.
fn max_re_org_slot_delay(seconds_per_slot: u64) -> Duration {
    // One interval into the slot is the attestation deadline; keep at least half of that
    // window free so the re-orging block has time to propagate.
    let attestation_deadline = Duration::from_secs(seconds_per_slot) / INTERVALS_PER_SLOT as u32;
    attestation_deadline / 2
}
// These keys are all zero because they get stored in different columns, see `DBColumn` type.
pub const BEACON_CHAIN_DB_KEY: Hash256 = Hash256::zero();
pub const OP_POOL_DB_KEY: Hash256 = Hash256::zero();
@@ -3761,7 +3754,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
// 1. It seems we have time to propagate and still receive the proposer boost.
// 2. The current head block was seen late.
// 3. The `get_proposer_head` conditions from fork choice pass.
let proposing_on_time = slot_delay < max_re_org_slot_delay(self.spec.seconds_per_slot);
let proposing_on_time = slot_delay < self.config.re_org_cutoff(self.spec.seconds_per_slot);
if !proposing_on_time {
debug!(
self.log,
@@ -3791,6 +3784,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
slot,
canonical_head,
re_org_threshold,
&self.config.re_org_disallowed_offsets,
self.config.re_org_max_epochs_since_finalization,
)
.map_err(|e| match e {
@@ -4069,6 +4063,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
.get_preliminary_proposer_head(
head_block_root,
re_org_threshold,
&self.config.re_org_disallowed_offsets,
self.config.re_org_max_epochs_since_finalization,
)
.map_err(|e| e.map_inner_error(Error::ProposerHeadForkChoiceError))?;
@@ -4079,7 +4074,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
let re_org_block_slot = head_slot + 1;
let fork_choice_slot = info.current_slot;
// If a re-orging proposal isn't made by the `max_re_org_slot_delay` then we give up
// If a re-orging proposal isn't made by the `re_org_cutoff` then we give up
// and allow the fork choice update for the canonical head through so that we may attest
// correctly.
let current_slot_ok = if head_slot == fork_choice_slot {
@@ -4090,7 +4085,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
.and_then(|slot_start| {
let now = self.slot_clock.now_duration()?;
let slot_delay = now.saturating_sub(slot_start);
Some(slot_delay <= max_re_org_slot_delay(self.spec.seconds_per_slot))
Some(slot_delay <= self.config.re_org_cutoff(self.spec.seconds_per_slot))
})
.unwrap_or(false)
} else {