keep failed finalized chains to avoid retries (#3142)

## Issue Addressed On very rare occasions we've seen most, if not all, of our peers on a chain with which we don't agree. Purging these peers can take a very long time: it scales with the number of retries of the chain. Meanwhile, sync is caught in a loop trying the chain again and again. This change fast-tracks purging those peers by registering the failed chain so that it is not retried for some time (30 seconds). Longer times could be dangerous, since a chain can also fail if, for example, a batch fails to download. Even in that case, I think it's still acceptable to fast-track purging the peers, since they are not providing the required info anyway. Co-authored-by: Divma <26765164+divagant-martian@users.noreply.github.com>
2026-06-17 02:38:34 +00:00 · 2022-04-13 01:10:55 +00:00
parent aa72088f8f
commit 7366266bd1
6 changed files with 114 additions and 148 deletions
--- a/beacon_node/network/src/sync/block_lookups/mod.rs
+++ b/beacon_node/network/src/sync/block_lookups/mod.rs
@@ -4,7 +4,7 @@ use std::time::Duration;
 use beacon_chain::{BeaconChainTypes, BlockError};
 use fnv::FnvHashMap;
 use lighthouse_network::{PeerAction, PeerId};
-use lru_cache::LRUCache;
+use lru_cache::LRUTimeCache;
 use slog::{crit, debug, error, trace, warn, Logger};
 use smallvec::SmallVec;
 use store::{Hash256, SignedBeaconBlock};
@@ -29,7 +29,7 @@ mod single_block_lookup;
 #[cfg(test)]
 mod tests;

-const FAILED_CHAINS_CACHE_SIZE: usize = 500;
+const FAILED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60;
 const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 3;

 pub(crate) struct BlockLookups<T: BeaconChainTypes> {
@@ -37,7 +37,7 @@ pub(crate) struct BlockLookups<T: BeaconChainTypes> {
    parent_queue: SmallVec<[ParentLookup<T::EthSpec>; 3]>,

    /// A cache of failed chain lookups to prevent duplicate searches.
-    failed_chains: LRUCache<Hash256>,
+    failed_chains: LRUTimeCache<Hash256>,

    /// A collection of block hashes being searched for and a flag indicating if a result has been
    /// received or not.
@@ -56,7 +56,9 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
    pub fn new(beacon_processor_send: mpsc::Sender<WorkEvent<T>>, log: Logger) -> Self {
        Self {
            parent_queue: Default::default(),
-            failed_chains: LRUCache::new(FAILED_CHAINS_CACHE_SIZE),
+            failed_chains: LRUTimeCache::new(Duration::from_secs(
+                FAILED_CHAINS_CACHE_EXPIRY_SECONDS,
+            )),
            single_block_lookups: Default::default(),
            beacon_processor_send,
            log,
@@ -218,7 +220,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
            return;
        };

-        match parent_lookup.verify_block(block, &self.failed_chains) {
+        match parent_lookup.verify_block(block, &mut self.failed_chains) {
            Ok(Some(block)) => {
                // Block is correct, send to the beacon processor.
                let chain_hash = parent_lookup.chain_hash();
--- a/beacon_node/network/src/sync/block_lookups/parent_lookup.rs
+++ b/beacon_node/network/src/sync/block_lookups/parent_lookup.rs
@@ -117,7 +117,7 @@ impl<T: EthSpec> ParentLookup<T> {
    pub fn verify_block(
        &mut self,
        block: Option<Box<SignedBeaconBlock<T>>>,
-        failed_chains: &lru_cache::LRUCache<Hash256>,
+        failed_chains: &mut lru_cache::LRUTimeCache<Hash256>,
    ) -> Result<Option<Box<SignedBeaconBlock<T>>>, VerifyError> {
        let block = self.current_parent_request.verify_block(block)?;

--- a/beacon_node/network/src/sync/range_sync/range.rs
+++ b/beacon_node/network/src/sync/range_sync/range.rs
@@ -49,13 +49,18 @@ use crate::sync::manager::Id;
 use crate::sync::network_context::SyncNetworkContext;
 use crate::sync::BatchProcessResult;
 use beacon_chain::{BeaconChain, BeaconChainTypes};
+use lighthouse_network::rpc::GoodbyeReason;
 use lighthouse_network::PeerId;
 use lighthouse_network::SyncInfo;
-use slog::{crit, debug, error, trace};
+use lru_cache::LRUTimeCache;
+use slog::{crit, debug, error, trace, warn};
 use std::collections::HashMap;
 use std::sync::Arc;
 use tokio::sync::mpsc;
-use types::{Epoch, EthSpec, SignedBeaconBlock, Slot};
+use types::{Epoch, EthSpec, Hash256, SignedBeaconBlock, Slot};
+
+/// For how long we store failed finalized chains to prevent retries.
+const FAILED_CHAINS_EXPIRY_SECONDS: u64 = 30;

 /// The primary object dealing with long range/batch syncing. This contains all the active and
 /// non-active chains that need to be processed before the syncing is considered complete. This
@@ -69,6 +74,8 @@ pub struct RangeSync<T: BeaconChainTypes, C = BeaconChain<T>> {
    /// A collection of chains that need to be downloaded. This stores any head or finalized chains
    /// that need to be downloaded.
    chains: ChainCollection<T, C>,
+    /// Chains that have failed and are stored to prevent being retried.
+    failed_chains: LRUTimeCache<Hash256>,
    /// A multi-threaded, non-blocking processor for applying messages to the beacon chain.
    beacon_processor_send: mpsc::Sender<BeaconWorkEvent<T>>,
    /// The syncing logger.
@@ -88,6 +95,9 @@ where
        RangeSync {
            beacon_chain: beacon_chain.clone(),
            chains: ChainCollection::new(beacon_chain, log.clone()),
+            failed_chains: LRUTimeCache::new(std::time::Duration::from_secs(
+                FAILED_CHAINS_EXPIRY_SECONDS,
+            )),
            awaiting_head_peers: HashMap::new(),
            beacon_processor_send,
            log,
@@ -128,6 +138,14 @@ where
        // determine which kind of sync to perform and set up the chains
        match RangeSyncType::new(self.beacon_chain.as_ref(), &local_info, &remote_info) {
            RangeSyncType::Finalized => {
+                // Make sure we have not recently tried this chain
+                if self.failed_chains.contains(&remote_info.finalized_root) {
+                    debug!(self.log, "Disconnecting peer that belongs to previously failed chain";
+                        "failed_root" => %remote_info.finalized_root, "peer_id" => %peer_id);
+                    network.goodbye_peer(peer_id, GoodbyeReason::IrrelevantNetwork);
+                    return;
+                }
+
                // Finalized chain search
                debug!(self.log, "Finalization sync peer joined"; "peer_id" => %peer_id);
                self.awaiting_head_peers.remove(&peer_id);
@@ -338,6 +356,13 @@ where
            debug!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
        }

+        if let RemoveChain::ChainFailed(_) = remove_reason {
+            if RangeSyncType::Finalized == sync_type {
+                warn!(self.log, "Chain failed! Syncing to its head won't be retried for at least the next {} seconds", FAILED_CHAINS_EXPIRY_SECONDS; &chain);
+                self.failed_chains.insert(chain.target_head_root);
+            }
+        }
+
        network.status_peers(self.beacon_chain.as_ref(), chain.peers());

        let local = match self.beacon_chain.status_message() {