diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs
index 4c82c93ba3..2c1dae9215 100644
--- a/beacon_node/beacon_chain/src/builder.rs
+++ b/beacon_node/beacon_chain/src/builder.rs
@@ -7,7 +7,6 @@ use crate::beacon_proposer_cache::BeaconProposerCache;
 use crate::custody_context::NodeCustodyType;
 use crate::data_availability_checker::DataAvailabilityChecker;
 use crate::fork_choice_signal::ForkChoiceSignalTx;
-use crate::fork_revert::{reset_fork_choice_to_finalization, revert_to_fork_boundary};
 use crate::graffiti_calculator::{GraffitiCalculator, GraffitiOrigin};
 use crate::kzg_utils::{build_data_column_sidecars_fulu, build_data_column_sidecars_gloas};
 use crate::light_client_server_cache::LightClientServerCache;
@@ -778,49 +777,17 @@ where
             .get_head(current_slot, &self.spec)
             .map_err(|e| format!("Unable to get fork choice head: {:?}", e))?;
 
-        // Try to decode the head block according to the current fork, if that fails, try
-        // to backtrack to before the most recent fork.
-        let (head_block_root, head_block, head_reverted) =
-            match store.get_full_block(&initial_head_block_root) {
-                Ok(Some(block)) => (initial_head_block_root, block, false),
-                Ok(None) => return Err("Head block not found in store".into()),
-                Err(StoreError::SszDecodeError(_)) => {
-                    error!(
-                        message = "This node has likely missed a hard fork. \
-                                   It will try to revert the invalid blocks and keep running, \
-                                   but any stray blocks and states will not be deleted. \
-                                   Long-term you should consider re-syncing this node.",
-                        "Error decoding head block"
-                    );
-                    let (block_root, block) = revert_to_fork_boundary(
-                        current_slot,
-                        initial_head_block_root,
-                        store.clone(),
-                        &self.spec,
-                    )?;
-
-                    (block_root, block, true)
-                }
-                Err(e) => return Err(descriptive_db_error("head block", &e)),
-            };
+        let head_block_root = initial_head_block_root;
+        let head_block = store
+            .get_full_block(&initial_head_block_root)
+            .map_err(|e| descriptive_db_error("head block", &e))?
+            .ok_or("Head block not found in store")?;
 
         let (_head_state_root, head_state) = store
             .get_advanced_hot_state(head_block_root, current_slot, head_block.state_root())
             .map_err(|e| descriptive_db_error("head state", &e))?
             .ok_or("Head state not found in store")?;
 
-        // If the head reverted then we need to reset fork choice using the new head's finalized
-        // checkpoint.
-        if head_reverted {
-            fork_choice = reset_fork_choice_to_finalization(
-                head_block_root,
-                &head_state,
-                store.clone(),
-                Some(current_slot),
-                &self.spec,
-            )?;
-        }
-
         let head_shuffling_ids = BlockShufflingIds::try_from_head(head_block_root, &head_state)?;
 
         let mut head_snapshot = BeaconSnapshot {
diff --git a/beacon_node/beacon_chain/src/fork_revert.rs b/beacon_node/beacon_chain/src/fork_revert.rs
deleted file mode 100644
index 986924bfc8..0000000000
--- a/beacon_node/beacon_chain/src/fork_revert.rs
+++ /dev/null
@@ -1,96 +0,0 @@
-use crate::BeaconForkChoiceStore;
-use fork_choice::ForkChoice;
-use itertools::process_results;
-use std::sync::Arc;
-use store::{HotColdDB, ItemStore, iter::ParentRootBlockIterator};
-use tracing::{info, warn};
-use types::{BeaconState, ChainSpec, EthSpec, ForkName, Hash256, SignedBeaconBlock, Slot};
-
-const CORRUPT_DB_MESSAGE: &str = "The database could be corrupt. Check its file permissions or \
-    consider deleting it by running with the --purge-db flag.";
-
-/// Revert the head to the last block before the most recent hard fork.
-///
-/// This function is destructive and should only be used if there is no viable alternative. It will
-/// cause the reverted blocks and states to be completely forgotten, lying dormant in the database
-/// forever.
-///
-/// Return the `(head_block_root, head_block)` that should be used post-reversion.
-pub fn revert_to_fork_boundary<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>>(
-    current_slot: Slot,
-    head_block_root: Hash256,
-    store: Arc<HotColdDB<E, Hot, Cold>>,
-    spec: &ChainSpec,
-) -> Result<(Hash256, SignedBeaconBlock<E>), String> {
-    let current_fork = spec.fork_name_at_slot::<E>(current_slot);
-    let fork_epoch = spec
-        .fork_epoch(current_fork)
-        .ok_or_else(|| format!("Current fork '{}' never activates", current_fork))?;
-
-    if current_fork == ForkName::Base {
-        return Err(format!(
-            "Cannot revert to before phase0 hard fork. {}",
-            CORRUPT_DB_MESSAGE
-        ));
-    }
-
-    warn!(
-        target_fork = %current_fork,
-        %fork_epoch,
-        "Reverting invalid head block"
-    );
-    let block_iter = ParentRootBlockIterator::fork_tolerant(&store, head_block_root);
-
-    let (block_root, blinded_block) = process_results(block_iter, |mut iter| {
-        iter.find_map(|(block_root, block)| {
-            if block.slot() < fork_epoch.start_slot(E::slots_per_epoch()) {
-                Some((block_root, block))
-            } else {
-                info!(
-                    ?block_root,
-                    slot = %block.slot(),
-                    "Reverting block"
-                );
-                None
-            }
-        })
-    })
-    .map_err(|e| {
-        format!(
-            "Error fetching blocks to revert: {:?}. {}",
-            e, CORRUPT_DB_MESSAGE
-        )
-    })?
-    .ok_or_else(|| format!("No pre-fork blocks found. {}", CORRUPT_DB_MESSAGE))?;
-
-    let block = store
-        .make_full_block(&block_root, blinded_block)
-        .map_err(|e| format!("Unable to add payload to new head block: {:?}", e))?;
-
-    Ok((block_root, block))
-}
-
-/// Reset fork choice to the finalized checkpoint of the supplied head state.
-///
-/// The supplied `head_block_root` should correspond to the most recently applied block on
-/// `head_state`.
-///
-/// This function avoids quirks of fork choice initialization by replaying all of the blocks from
-/// the checkpoint to the head.
-///
-/// See this issue for details: https://github.com/ethereum/consensus-specs/issues/2566
-///
-/// It will fail if the finalized state or any of the blocks to replay are unavailable.
-///
-/// WARNING: this function is destructive and causes fork choice to permanently forget all
-/// chains other than the chain leading to `head_block_root`. It should only be used in extreme
-/// circumstances when there is no better alternative.
-pub fn reset_fork_choice_to_finalization<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>>(
-    _head_block_root: Hash256,
-    _head_state: &BeaconState<E>,
-    _store: Arc<HotColdDB<E, Hot, Cold>>,
-    _current_slot: Option<Slot>,
-    _spec: &ChainSpec,
-) -> Result<ForkChoice<BeaconForkChoiceStore<E, Hot, Cold>, E>, String> {
-    Err("broken".into())
-}
diff --git a/beacon_node/beacon_chain/src/lib.rs b/beacon_node/beacon_chain/src/lib.rs
index 3b03395a66..e1a190ffb3 100644
--- a/beacon_node/beacon_chain/src/lib.rs
+++ b/beacon_node/beacon_chain/src/lib.rs
@@ -26,7 +26,6 @@ pub mod events;
 pub mod execution_payload;
 pub mod fetch_blobs;
 pub mod fork_choice_signal;
-pub mod fork_revert;
 pub mod graffiti_calculator;
 pub mod historical_blocks;
 pub mod historical_data_columns;
diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs
index 5b0f0aa388..0cfaac7502 100644
--- a/beacon_node/beacon_chain/tests/store_tests.rs
+++ b/beacon_node/beacon_chain/tests/store_tests.rs
@@ -3929,188 +3929,6 @@ async fn finalizes_after_resuming_from_db() {
     );
 }
 
-#[allow(clippy::large_stack_frames)]
-#[tokio::test]
-async fn revert_minority_fork_on_resume() {
-    let validator_count = 16;
-    let slots_per_epoch = MinimalEthSpec::slots_per_epoch();
-
-    let fork_epoch = Epoch::new(4);
-    let fork_slot = fork_epoch.start_slot(slots_per_epoch);
-    let initial_blocks = slots_per_epoch * fork_epoch.as_u64() - 1;
-    let post_fork_blocks = slots_per_epoch * 3;
-
-    let mut spec1 = MinimalEthSpec::default_spec();
-    spec1.altair_fork_epoch = None;
-    let mut spec2 = MinimalEthSpec::default_spec();
-    spec2.altair_fork_epoch = Some(fork_epoch);
-
-    let all_validators = (0..validator_count).collect::<Vec<usize>>();
-
-    // Chain with no fork epoch configured.
-    let db_path1 = tempdir().unwrap();
-    let store1 = get_store_generic(&db_path1, StoreConfig::default(), spec1.clone());
-    let harness1 = BeaconChainHarness::builder(MinimalEthSpec)
-        .spec(spec1.clone().into())
-        .keypairs(KEYPAIRS[0..validator_count].to_vec())
-        .fresh_disk_store(store1)
-        .mock_execution_layer()
-        .build();
-
-    // Chain with fork epoch configured.
-    let db_path2 = tempdir().unwrap();
-    let store2 = get_store_generic(&db_path2, StoreConfig::default(), spec2.clone());
-    let harness2 = BeaconChainHarness::builder(MinimalEthSpec)
-        .spec(spec2.clone().into())
-        .keypairs(KEYPAIRS[0..validator_count].to_vec())
-        .fresh_disk_store(store2)
-        .mock_execution_layer()
-        .build();
-
-    // Apply the same blocks to both chains initially.
-    let mut state = harness1.get_current_state();
-    let mut block_root = harness1.chain.genesis_block_root;
-    for slot in (1..=initial_blocks).map(Slot::new) {
-        let state_root = state.update_tree_hash_cache().unwrap();
-
-        let attestations = harness1.make_attestations(
-            &all_validators,
-            &state,
-            state_root,
-            block_root.into(),
-            slot,
-        );
-        harness1.set_current_slot(slot);
-        harness2.set_current_slot(slot);
-        harness1.process_attestations(attestations.clone(), &state);
-        harness2.process_attestations(attestations, &state);
-
-        let ((block, blobs), new_state) = harness1.make_block(state, slot).await;
-
-        harness1
-            .process_block(slot, block.canonical_root(), (block.clone(), blobs.clone()))
-            .await
-            .unwrap();
-        harness2
-            .process_block(slot, block.canonical_root(), (block.clone(), blobs.clone()))
-            .await
-            .unwrap();
-
-        state = new_state;
-        block_root = block.canonical_root();
-    }
-
-    assert_eq!(harness1.head_slot(), fork_slot - 1);
-    assert_eq!(harness2.head_slot(), fork_slot - 1);
-
-    // Fork the two chains.
-    let mut state1 = state.clone();
-    let mut state2 = state.clone();
-
-    let mut majority_blocks = vec![];
-
-    for i in 0..post_fork_blocks {
-        let slot = fork_slot + i;
-
-        // Attestations on majority chain.
-        let state_root = state.update_tree_hash_cache().unwrap();
-
-        let attestations = harness2.make_attestations(
-            &all_validators,
-            &state2,
-            state_root,
-            block_root.into(),
-            slot,
-        );
-        harness2.set_current_slot(slot);
-        harness2.process_attestations(attestations, &state2);
-
-        // Minority chain block (no attesters).
-        let ((block1, blobs1), new_state1) = harness1.make_block(state1, slot).await;
-        harness1
-            .process_block(slot, block1.canonical_root(), (block1, blobs1))
-            .await
-            .unwrap();
-        state1 = new_state1;
-
-        // Majority chain block (all attesters).
-        let ((block2, blobs2), new_state2) = harness2.make_block(state2, slot).await;
-        harness2
-            .process_block(slot, block2.canonical_root(), (block2.clone(), blobs2))
-            .await
-            .unwrap();
-
-        state2 = new_state2;
-        block_root = block2.canonical_root();
-
-        majority_blocks.push(block2);
-    }
-
-    let end_slot = fork_slot + post_fork_blocks - 1;
-    assert_eq!(harness1.head_slot(), end_slot);
-    assert_eq!(harness2.head_slot(), end_slot);
-
-    // Resume from disk with the hard-fork activated: this should revert the post-fork blocks.
-    // We have to do some hackery with the `slot_clock` so that the correct slot is set when
-    // the beacon chain builder loads the head block.
-    drop(harness1);
-    let resume_store = get_store_generic(&db_path1, StoreConfig::default(), spec2.clone());
-
-    let resumed_harness = TestHarness::builder(MinimalEthSpec)
-        .spec(spec2.clone().into())
-        .keypairs(KEYPAIRS[0..validator_count].to_vec())
-        .resumed_disk_store(resume_store)
-        .override_store_mutator(Box::new(move |mut builder| {
-            builder = builder
-                .resume_from_db()
-                .unwrap()
-                .testing_slot_clock(spec2.get_slot_duration())
-                .unwrap();
-            builder
-                .get_slot_clock()
-                .unwrap()
-                .set_slot(end_slot.as_u64());
-            builder
-        }))
-        .mock_execution_layer()
-        .build();
-
-    // Head should now be just before the fork.
-    resumed_harness.chain.recompute_head_at_current_slot().await;
-    assert_eq!(resumed_harness.head_slot(), fork_slot - 1);
-
-    // Fork choice should only know the canonical head. When we reverted the head we also should
-    // have called `reset_fork_choice_to_finalization` which rebuilds fork choice from scratch
-    // without the reverted block.
-    assert_eq!(
-        resumed_harness.chain.heads(),
-        vec![(resumed_harness.head_block_root(), fork_slot - 1)]
-    );
-
-    // Apply blocks from the majority chain and trigger finalization.
-    let initial_split_slot = resumed_harness.chain.store.get_split_slot();
-    for block in &majority_blocks {
-        resumed_harness
-            .process_block_result((block.clone(), None))
-            .await
-            .unwrap();
-
-        // The canonical head should be the block from the majority chain.
-        resumed_harness.chain.recompute_head_at_current_slot().await;
-        assert_eq!(resumed_harness.head_slot(), block.slot());
-        assert_eq!(resumed_harness.head_block_root(), block.canonical_root());
-    }
-    let advanced_split_slot = resumed_harness.chain.store.get_split_slot();
-
-    // Check that the migration ran successfully.
-    assert!(advanced_split_slot > initial_split_slot);
-
-    // Check that there is only a single head now matching harness2 (the minority chain is gone).
-    let heads = resumed_harness.chain.heads();
-    assert_eq!(heads, harness2.chain.heads());
-    assert_eq!(heads.len(), 1);
-}
-
 // This test checks whether the schema downgrade from the latest version to some minimum supported
 // version is correct. This is the easiest schema test to write without historic versions of
 // Lighthouse on-hand, but has the disadvantage that the min version needs to be adjusted manually
diff --git a/beacon_node/network/src/sync/batch.rs b/beacon_node/network/src/sync/batch.rs
index 8de386f5be..f9a1fcce39 100644
--- a/beacon_node/network/src/sync/batch.rs
+++ b/beacon_node/network/src/sync/batch.rs
@@ -213,6 +213,9 @@ impl BatchInfo {
     /// After different operations over a batch, this could be in a state that allows it to
     /// continue, or in failed state. When the batch has failed, we check if it did mainly due to
     /// processing failures. In this case the batch is considered failed and faulty.
+    ///
+    /// When failure counts are equal, `blacklist` is `false` — we assume network issues over
+    /// peer fault when the evidence is ambiguous.
     pub fn outcome(&self) -> BatchOperationOutcome {
         match self.state {
             BatchState::Poisoned => unreachable!("Poisoned batch"),
@@ -255,8 +258,10 @@ impl BatchInfo {
     /// Mark the batch as failed and return whether we can attempt a re-download.
     ///
     /// This can happen if a peer disconnects or some error occurred that was not the peers fault.
-    /// The `peer` parameter, when set to None, does not increment the failed attempts of
-    /// this batch and register the peer, rather attempts a re-download.
+    /// The `peer` parameter, when set to `None`, still counts toward
+    /// `max_batch_download_attempts` (to prevent infinite retries on persistent failures)
+    /// but does not register a peer in `failed_peers()`. Use
+    /// [`Self::downloading_to_awaiting_download`] to retry without counting a failed attempt.
     #[must_use = "Batch may have failed"]
     pub fn download_failed(
         &mut self,
@@ -272,7 +277,6 @@
         {
             BatchState::Failed
         } else {
-            // drop the blocks
            BatchState::AwaitingDownload
        };
        Ok(self.outcome())
    }
@@ -524,3 +528,196 @@ impl BatchState {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::sync::range_sync::RangeSyncBatchConfig;
+    use types::MinimalEthSpec;
+
+    type Cfg = RangeSyncBatchConfig;
+    type TestBatch = BatchInfo<MinimalEthSpec, Cfg>;
+
+    fn max_dl() -> u8 {
+        Cfg::max_batch_download_attempts()
+    }
+
+    fn max_proc() -> u8 {
+        Cfg::max_batch_processing_attempts()
+    }
+
+    fn new_batch() -> TestBatch {
+        BatchInfo::new(&Epoch::new(0), 1, ByRangeRequestType::Blocks)
+    }
+
+    fn peer() -> PeerId {
+        PeerId::random()
+    }
+
+    fn advance_to_processing(batch: &mut TestBatch, req_id: Id, peer_id: PeerId) {
+        batch.start_downloading(req_id).unwrap();
+        batch.download_completed(vec![1, 2, 3], peer_id).unwrap();
+        batch.start_processing().unwrap();
+    }
+
+    fn advance_to_awaiting_validation(batch: &mut TestBatch, req_id: Id, peer_id: PeerId) {
+        advance_to_processing(batch, req_id, peer_id);
+        batch
+            .processing_completed(BatchProcessingResult::Success)
+            .unwrap();
+    }
+
+    #[test]
+    fn happy_path_lifecycle() {
+        let mut batch = new_batch();
+        let p = peer();
+
+        assert!(matches!(batch.state(), BatchState::AwaitingDownload));
+
+        batch.start_downloading(1).unwrap();
+        assert!(matches!(batch.state(), BatchState::Downloading(1)));
+
+        batch.download_completed(vec![10, 20], p).unwrap();
+        assert!(matches!(batch.state(), BatchState::AwaitingProcessing(..)));
+
+        let (data, _duration) = batch.start_processing().unwrap();
+        assert_eq!(data, vec![10, 20]);
+        assert!(matches!(batch.state(), BatchState::Processing(..)));
+
+        let outcome = batch
+            .processing_completed(BatchProcessingResult::Success)
+            .unwrap();
+        assert!(matches!(outcome, BatchOperationOutcome::Continue));
+        assert!(matches!(batch.state(), BatchState::AwaitingValidation(..)));
+    }
+
+    #[test]
+    fn download_failures_count_toward_limit() {
+        let mut batch = new_batch();
+
+        for i in 1..max_dl() as Id {
+            batch.start_downloading(i).unwrap();
+            let outcome = batch.download_failed(Some(peer())).unwrap();
+            assert!(matches!(outcome, BatchOperationOutcome::Continue));
+        }
+
+        // Next failure hits the limit
+        batch.start_downloading(max_dl() as Id).unwrap();
+        let outcome = batch.download_failed(Some(peer())).unwrap();
+        assert!(matches!(
+            outcome,
+            BatchOperationOutcome::Failed { blacklist: false }
+        ));
+    }
+
+    #[test]
+    fn download_failed_none_counts_but_does_not_blame_peer() {
+        let mut batch = new_batch();
+
+        // None still counts toward the limit (prevents infinite retry on persistent
+        // network failures), but doesn't register a peer in failed_peers().
+        for i in 0..max_dl() as Id {
+            batch.start_downloading(i).unwrap();
+            batch.download_failed(None).unwrap();
+        }
+        assert!(matches!(batch.state(), BatchState::Failed));
+        assert!(batch.failed_peers().is_empty());
+    }
+
+    #[test]
+    fn faulty_processing_failures_count_toward_limit() {
+        let mut batch = new_batch();
+
+        for i in 1..max_proc() as Id {
+            advance_to_processing(&mut batch, i, peer());
+            let outcome = batch
+                .processing_completed(BatchProcessingResult::FaultyFailure)
+                .unwrap();
+            assert!(matches!(outcome, BatchOperationOutcome::Continue));
+        }
+
+        // Next faulty failure: limit reached
+        advance_to_processing(&mut batch, max_proc() as Id, peer());
+        let outcome = batch
+            .processing_completed(BatchProcessingResult::FaultyFailure)
+            .unwrap();
+        assert!(matches!(
+            outcome,
+            BatchOperationOutcome::Failed { blacklist: true }
+        ));
+    }
+
+    #[test]
+    fn non_faulty_processing_failures_never_exhaust_batch() {
+        let mut batch = new_batch();
+
+        // Well past both limits — non-faulty failures should never cause failure
+        let iterations = (max_dl() + max_proc()) as Id * 2;
+        for i in 0..iterations {
+            advance_to_processing(&mut batch, i, peer());
+            let outcome = batch
+                .processing_completed(BatchProcessingResult::NonFaultyFailure)
+                .unwrap();
+            assert!(matches!(outcome, BatchOperationOutcome::Continue));
+        }
+        // Non-faulty failures also don't register peers as failed
+        assert!(batch.failed_peers().is_empty());
+    }
+
+    #[test]
+    fn validation_failures_count_toward_processing_limit() {
+        let mut batch = new_batch();
+
+        for i in 1..max_proc() as Id {
+            advance_to_awaiting_validation(&mut batch, i, peer());
+            let outcome = batch.validation_failed().unwrap();
+            assert!(matches!(outcome, BatchOperationOutcome::Continue));
+        }
+
+        advance_to_awaiting_validation(&mut batch, max_proc() as Id, peer());
+        let outcome = batch.validation_failed().unwrap();
+        assert!(matches!(
+            outcome,
+            BatchOperationOutcome::Failed { blacklist: true }
+        ));
+    }
+
+    #[test]
+    fn mixed_failure_types_interact_correctly() {
+        let mut batch = new_batch();
+        let mut req_id: Id = 0;
+        let mut next_id = || {
+            req_id += 1;
+            req_id
+        };
+
+        // One download failure
+        batch.start_downloading(next_id()).unwrap();
+        batch.download_failed(Some(peer())).unwrap();
+
+        // One faulty processing failure (requires a successful download first)
+        advance_to_processing(&mut batch, next_id(), peer());
+        batch
+            .processing_completed(BatchProcessingResult::FaultyFailure)
+            .unwrap();
+
+        // One non-faulty processing failure
+        advance_to_processing(&mut batch, next_id(), peer());
+        batch
+            .processing_completed(BatchProcessingResult::NonFaultyFailure)
+            .unwrap();
+        assert!(matches!(batch.state(), BatchState::AwaitingDownload));
+
+        // Fill remaining download failures to hit the limit
+        for _ in 1..max_dl() {
+            batch.start_downloading(next_id()).unwrap();
+            batch.download_failed(Some(peer())).unwrap();
+        }
+
+        // Download failures > processing failures → blacklist: false
+        assert!(matches!(
+            batch.outcome(),
+            BatchOperationOutcome::Failed { blacklist: false }
+        ));
+    }
+}
diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs
index dd9f17bfd1..3b65e1c84a 100644
--- a/beacon_node/network/src/sync/range_sync/mod.rs
+++ b/beacon_node/network/src/sync/range_sync/mod.rs
@@ -5,6 +5,8 @@ mod chain_collection;
 mod range;
 mod sync_type;
 
+#[cfg(test)]
+pub use chain::RangeSyncBatchConfig;
 pub use chain::{ChainId, EPOCHS_PER_BATCH};
 #[cfg(test)]
 pub use chain_collection::SyncChainStatus;
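
The new `outcome()` documentation in this patch pins down the failure-attribution rule the added tests exercise: a failed batch blacklists its peers only when processing failures strictly outnumber download failures, and a tie is blamed on the network. The following is a minimal, self-contained sketch of that rule; `attribute_failure` and its counter parameters are illustrative names, not the actual `BatchInfo` fields or methods.

/// Decide whether the peers behind a failed batch should be blacklisted,
/// mirroring the rule described in the `outcome()` doc comment above.
fn attribute_failure(download_failures: usize, processing_failures: usize) -> bool {
    // Blacklist only when processing failures strictly outnumber download failures.
    // A tie is ambiguous evidence, so it is treated as a network problem and the
    // serving peers are not blacklisted.
    processing_failures > download_failures
}

fn main() {
    assert!(!attribute_failure(3, 3)); // equal counts: assume network issues
    assert!(!attribute_failure(5, 1)); // mostly download failures: assume network issues
    assert!(attribute_failure(1, 5)); // mostly processing failures: blame the peers
}

This matches the behaviour asserted by `download_failures_count_toward_limit` (blacklist stays false when only downloads fail) and `faulty_processing_failures_count_toward_limit` (blacklist becomes true when only processing fails).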