mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-03 00:31:50 +00:00
@@ -257,9 +257,9 @@ lazy_static! {
|
||||
"sync_lookups_completed_total",
|
||||
"Total count of sync lookups completed",
|
||||
);
|
||||
pub static ref SYNC_LOOKUPS_STUCK: Result<IntGauge> = try_create_int_gauge(
|
||||
"sync_lookups_stuck",
|
||||
"Current count of sync lookups that may be stuck",
|
||||
pub static ref SYNC_LOOKUPS_STUCK: Result<IntCounter> = try_create_int_counter(
|
||||
"sync_lookups_stuck_total",
|
||||
"Total count of sync lookups that are stuck and dropped",
|
||||
);
|
||||
|
||||
/*
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::sync::block_lookups::single_block_lookup::{
|
||||
LookupRequestError, SingleBlockLookup, SingleLookupRequestState,
|
||||
};
|
||||
use crate::sync::block_lookups::{BlobRequestState, BlockRequestState, PeerId};
|
||||
use crate::sync::manager::{Id, SLOT_IMPORT_TOLERANCE};
|
||||
use crate::sync::manager::Id;
|
||||
use crate::sync::network_context::{LookupRequestResult, SyncNetworkContext};
|
||||
use beacon_chain::block_verification_types::RpcBlock;
|
||||
use beacon_chain::BeaconChainTypes;
|
||||
@@ -19,11 +19,6 @@ pub enum ResponseType {
|
||||
Blob,
|
||||
}
|
||||
|
||||
/// The maximum depth we will search for a parent block. In principle we should have sync'd any
|
||||
/// canonical chain to its head once the peer connects. A chain should not appear where it's depth
|
||||
/// is further back than the most recent head slot.
|
||||
pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE * 2;
|
||||
|
||||
/// This trait unifies common single block lookup functionality across blocks and blobs. This
|
||||
/// includes making requests, verifying responses, and handling processing results. A
|
||||
/// `SingleBlockLookup` includes both a `BlockRequestState` and a `BlobRequestState`, this trait is
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use self::parent_chain::{compute_parent_chains, NodeChain};
|
||||
pub use self::single_block_lookup::DownloadResult;
|
||||
use self::single_block_lookup::{LookupRequestError, LookupResult, SingleBlockLookup};
|
||||
use super::manager::{BlockProcessType, BlockProcessingResult};
|
||||
use super::manager::{BlockProcessType, BlockProcessingResult, SLOT_IMPORT_TOLERANCE};
|
||||
use super::network_context::{RpcResponseResult, SyncNetworkContext};
|
||||
use crate::metrics;
|
||||
use crate::sync::block_lookups::common::{ResponseType, PARENT_DEPTH_TOLERANCE};
|
||||
use crate::sync::block_lookups::common::ResponseType;
|
||||
use crate::sync::block_lookups::parent_chain::find_oldest_fork_ancestor;
|
||||
use crate::sync::manager::{Id, SingleLookupReqId};
|
||||
use beacon_chain::block_verification_types::AsBlock;
|
||||
@@ -28,9 +28,18 @@ mod single_block_lookup;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
/// The maximum depth we will search for a parent block. In principle we should have sync'd any
|
||||
/// canonical chain to its head once the peer connects. A chain should not appear where it's depth
|
||||
/// is further back than the most recent head slot.
|
||||
pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE * 2;
|
||||
|
||||
const FAILED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60;
|
||||
pub const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 4;
|
||||
const LOOKUP_MAX_DURATION_SECS: u64 = 60;
|
||||
|
||||
/// Maximum time we allow a lookup to exist before assuming it is stuck and will never make
|
||||
/// progress. Assume the worse case processing time per block component set * times max depth.
|
||||
/// 15 * 2 * 32 = 16 minutes.
|
||||
const LOOKUP_MAX_DURATION_SECS: usize = 15 * PARENT_DEPTH_TOLERANCE;
|
||||
|
||||
pub enum BlockComponent<E: EthSpec> {
|
||||
Block(DownloadResult<Arc<SignedBeaconBlock<E>>>),
|
||||
@@ -680,20 +689,72 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
);
|
||||
}
|
||||
|
||||
pub fn log_stuck_lookups(&self) {
|
||||
let mut stuck_count = 0;
|
||||
for lookup in self.single_block_lookups.values() {
|
||||
if lookup.elapsed_since_created() > Duration::from_secs(LOOKUP_MAX_DURATION_SECS) {
|
||||
debug!(self.log, "Lookup maybe stuck";
|
||||
// Fields id and block_root are also part of the summary. However, logging them
|
||||
// here allows log parsers o index them and have better search
|
||||
"id" => lookup.id,
|
||||
"block_root" => ?lookup.block_root(),
|
||||
"summary" => ?lookup,
|
||||
/// Safety mechanism to unstuck lookup sync. Lookup sync if purely event driven and depends on
|
||||
/// external components to feed it events to make progress. If there is a bug in network, in
|
||||
/// beacon processor, or here internally: lookups can get stuck forever. A stuck lookup can
|
||||
/// stall a node indefinitely as other lookup will be awaiting on a parent lookup to make
|
||||
/// progress.
|
||||
///
|
||||
/// If a lookup lasts more than LOOKUP_MAX_DURATION_SECS this function will find its oldest
|
||||
/// ancestor and then drop it and all its children. This action will allow the node to unstuck
|
||||
/// itself. Bugs that cause lookups to get stuck may be triggered consistently. So this strategy
|
||||
/// is useful for two reasons:
|
||||
///
|
||||
/// - One single clear warn level log per stuck incident
|
||||
/// - If the original bug is sporadic, it reduces the time a node is stuck from forever to 15 min
|
||||
pub fn drop_stuck_lookups(&mut self) {
|
||||
// While loop to find and drop all disjoint trees of potentially stuck lookups.
|
||||
while let Some(stuck_lookup) = self.single_block_lookups.values().find(|lookup| {
|
||||
lookup.elapsed_since_created() > Duration::from_secs(LOOKUP_MAX_DURATION_SECS as u64)
|
||||
}) {
|
||||
let ancestor_stuck_lookup = match self.find_oldest_ancestor_lookup(stuck_lookup) {
|
||||
Ok(lookup) => lookup,
|
||||
Err(e) => {
|
||||
warn!(self.log, "Error finding oldest ancestor lookup"; "error" => ?e);
|
||||
// Default to dropping the lookup that exceeds the max duration so at least
|
||||
// eventually sync should be unstuck
|
||||
stuck_lookup
|
||||
}
|
||||
};
|
||||
|
||||
if stuck_lookup.id == ancestor_stuck_lookup.id {
|
||||
warn!(self.log, "Notify the devs, a sync lookup is stuck";
|
||||
"block_root" => ?stuck_lookup.block_root(),
|
||||
"lookup" => ?stuck_lookup,
|
||||
);
|
||||
} else {
|
||||
warn!(self.log, "Notify the devs, a sync lookup is stuck";
|
||||
"block_root" => ?stuck_lookup.block_root(),
|
||||
"lookup" => ?stuck_lookup,
|
||||
"ancestor_block_root" => ?ancestor_stuck_lookup.block_root(),
|
||||
"ancestor_lookup" => ?ancestor_stuck_lookup,
|
||||
);
|
||||
stuck_count += 1;
|
||||
}
|
||||
|
||||
metrics::inc_counter(&metrics::SYNC_LOOKUPS_STUCK);
|
||||
self.drop_lookup_and_children(ancestor_stuck_lookup.id);
|
||||
}
|
||||
}
|
||||
|
||||
/// Recursively find the oldest ancestor lookup of another lookup
|
||||
fn find_oldest_ancestor_lookup<'a>(
|
||||
&'a self,
|
||||
stuck_lookup: &'a SingleBlockLookup<T>,
|
||||
) -> Result<&'a SingleBlockLookup<T>, String> {
|
||||
if let Some(awaiting_parent) = stuck_lookup.awaiting_parent() {
|
||||
if let Some(lookup) = self
|
||||
.single_block_lookups
|
||||
.values()
|
||||
.find(|l| l.block_root() == awaiting_parent)
|
||||
{
|
||||
self.find_oldest_ancestor_lookup(lookup)
|
||||
} else {
|
||||
Err(format!(
|
||||
"Lookup references unknown parent {awaiting_parent:?}"
|
||||
))
|
||||
}
|
||||
} else {
|
||||
Ok(stuck_lookup)
|
||||
}
|
||||
metrics::set_gauge(&metrics::SYNC_LOOKUPS_STUCK, stuck_count);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::sync::Arc;
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::sync::block_lookups::common::{ResponseType, PARENT_DEPTH_TOLERANCE};
|
||||
use crate::sync::block_lookups::common::ResponseType;
|
||||
use beacon_chain::blob_verification::GossipVerifiedBlob;
|
||||
use beacon_chain::block_verification_types::{BlockImportData, RpcBlock};
|
||||
use beacon_chain::builder::Witness;
|
||||
|
||||
@@ -561,7 +561,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
self.handle_new_execution_engine_state(engine_state);
|
||||
}
|
||||
_ = interval.tick() => {
|
||||
self.block_lookups.log_stuck_lookups();
|
||||
self.block_lookups.drop_stuck_lookups();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user