From d457ceeaafaeb4c4818e950186be2eddf7e144e7 Mon Sep 17 00:00:00 2001 From: Lion - dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 5 Jun 2025 02:53:43 -0600 Subject: [PATCH] Don't create child lookup if parent is faulty (#7118) Issue discovered on PeerDAS devnet (node `lighthouse-geth-2.peerdas-devnet-5.ethpandaops.io`). Summary: - A lookup is created for block root `0x28299de15843970c8ea4f95f11f07f75e76a690f9a8af31d354c38505eebbe12` - That block or a parent is faulty and `0x28299de15843970c8ea4f95f11f07f75e76a690f9a8af31d354c38505eebbe12` is added to the failed chains cache - We later receive a block that is a child of a child of `0x28299de15843970c8ea4f95f11f07f75e76a690f9a8af31d354c38505eebbe12` - We create a lookup, which attempts to process the child of `0x28299de15843970c8ea4f95f11f07f75e76a690f9a8af31d354c38505eebbe12` and hit a processor error `UnknownParent`, hitting this line https://github.com/sigp/lighthouse/blob/bf955c7543dac8911a6f6c334b5b3ca4ef728d9c/beacon_node/network/src/sync/block_lookups/mod.rs#L686-L688 `search_parent_of_child` does not create a parent lookup because the parent root is in the failed chain cache. However, we have **already** marked the child as awaiting the parent. This results in an inconsistent state of lookup sync, as there's a lookup awaiting a parent that doesn't exist. Now we have a lookup (the child of `0x28299de15843970c8ea4f95f11f07f75e76a690f9a8af31d354c38505eebbe12`) that is awaiting a parent lookup that doesn't exist: hence stuck. ### Impact This bug can affect Mainnet as well as PeerDAS devnets. This bug may stall lookup sync for a few minutes (up to `LOOKUP_MAX_DURATION_STUCK_SECS = 15 min`) until the stuck prune routine deletes it. By that time the root will be cleared from the failed chain cache and sync should succeed. During that time the user will see a lot of `WARN` logs when attempting to add each peer to the inconsistent lookup. We may also sync the block through range sync if we fall behind by more than 2 epochs. We may also create the parent lookup successfully after the failed cache clears and complete the child lookup. This bug is triggered if: - We have a lookup that fails and its root is added to the failed chain cache (much more likely to happen in PeerDAS networks) - We receive a block that builds on a child of the block added to the failed chain cache Ensure that we never create (or leave existing) a lookup that references a non-existing parent. I added `must_use` lints to the functions that create lookups. To fix the specific bug we must recursively drop the child lookup if the parent is not created. So if `search_parent_of_child` returns `false` now return `LookupRequestError::Failed` instead of `LookupResult::Pending`. As a bonus I have a added more logging and reason strings to the errors --- .../network/src/sync/block_lookups/mod.rs | 62 +++++++++++++------ .../sync/block_lookups/single_block_lookup.rs | 2 +- beacon_node/network/src/sync/manager.rs | 23 +++++-- beacon_node/network/src/sync/tests/lookups.rs | 57 +++++++++++++++++ 4 files changed, 119 insertions(+), 25 deletions(-) diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index 8c884f644e..96b088747b 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -106,7 +106,7 @@ pub type SingleLookupId = u32; enum Action { Retry, ParentUnknown { parent_root: Hash256 }, - Drop, + Drop(/* reason: */ String), Continue, } @@ -194,19 +194,22 @@ impl BlockLookups { /// Creates a parent lookup for the block with the given `block_root` and immediately triggers it. /// If a parent lookup exists or is triggered, a current lookup will be created. + /// + /// Returns true if the lookup is created or already exists #[instrument(parent = None, level = "info", fields(service = "lookup_sync"), name = "lookup_sync", skip_all )] + #[must_use = "only reference the new lookup if returns true"] pub fn search_child_and_parent( &mut self, block_root: Hash256, block_component: BlockComponent, peer_id: PeerId, cx: &mut SyncNetworkContext, - ) { + ) -> bool { let parent_root = block_component.parent_root(); let parent_lookup_exists = @@ -223,11 +226,14 @@ impl BlockLookups { // the lookup with zero peers to house the block components. &[], cx, - ); + ) + } else { + false } } /// Seach a block whose parent root is unknown. + /// /// Returns true if the lookup is created or already exists #[instrument(parent = None, level = "info", @@ -235,13 +241,14 @@ impl BlockLookups { name = "lookup_sync", skip_all )] + #[must_use = "only reference the new lookup if returns true"] pub fn search_unknown_block( &mut self, block_root: Hash256, peer_source: &[PeerId], cx: &mut SyncNetworkContext, - ) { - self.new_current_lookup(block_root, None, None, peer_source, cx); + ) -> bool { + self.new_current_lookup(block_root, None, None, peer_source, cx) } /// A block or blob triggers the search of a parent. @@ -256,6 +263,7 @@ impl BlockLookups { name = "lookup_sync", skip_all )] + #[must_use = "only reference the new lookup if returns true"] pub fn search_parent_of_child( &mut self, block_root_to_search: Hash256, @@ -363,6 +371,7 @@ impl BlockLookups { name = "lookup_sync", skip_all )] + #[must_use = "only reference the new lookup if returns true"] fn new_current_lookup( &mut self, block_root: Hash256, @@ -656,7 +665,7 @@ impl BlockLookups { // This is unreachable because RPC blocks do not undergo gossip verification, and // this error can *only* come from gossip verification. error!(?block_root, "Single block lookup hit unreachable condition"); - Action::Drop + Action::Drop("DuplicateImportStatusUnknown".to_owned()) } BlockProcessingResult::Ignored => { // Beacon processor signalled to ignore the block processing result. @@ -665,14 +674,14 @@ impl BlockLookups { component = ?R::response_type(), "Lookup component processing ignored, cpu might be overloaded" ); - Action::Drop + Action::Drop("Block processing ignored".to_owned()) } BlockProcessingResult::Err(e) => { match e { BlockError::BeaconChainError(e) => { // Internal error error!(%block_root, error = ?e, "Beacon chain error processing lookup component"); - Action::Drop + Action::Drop(format!("{e:?}")) } BlockError::ParentUnknown { parent_root, .. } => { // Reverts the status of this request to `AwaitingProcessing` holding the @@ -691,7 +700,7 @@ impl BlockLookups { error = ?e, "Single block lookup failed. Execution layer is offline / unsynced / misconfigured" ); - Action::Drop + Action::Drop(format!("{e:?}")) } BlockError::AvailabilityCheck(e) if e.category() == AvailabilityCheckErrorCategory::Internal => @@ -703,7 +712,7 @@ impl BlockLookups { // lookup state transition. This error invalidates both blob and block requests, and we don't know the // stateĀ of both requests. Blobs may have already successfullly processed for example. // We opt to drop the lookup instead. - Action::Drop + Action::Drop(format!("{e:?}")) } other => { debug!( @@ -757,19 +766,32 @@ impl BlockLookups { } Action::ParentUnknown { parent_root } => { let peers = lookup.all_peers(); + // Mark lookup as awaiting **before** creating the parent lookup. At this point the + // lookup maybe inconsistent. lookup.set_awaiting_parent(parent_root); - debug!( - id = lookup.id, - ?block_root, - ?parent_root, - "Marking lookup as awaiting parent" - ); - self.search_parent_of_child(parent_root, block_root, &peers, cx); - Ok(LookupResult::Pending) + let parent_lookup_exists = + self.search_parent_of_child(parent_root, block_root, &peers, cx); + if parent_lookup_exists { + // The parent lookup exist or has been created. It's safe for `lookup` to + // reference the parent as awaiting. + debug!( + id = lookup_id, + ?block_root, + ?parent_root, + "Marking lookup as awaiting parent" + ); + Ok(LookupResult::Pending) + } else { + // The parent lookup is faulty and was not created, we must drop the `lookup` as + // it's in an inconsistent state. We must drop all of its children too. + Err(LookupRequestError::Failed(format!( + "Parent lookup is faulty {parent_root:?}" + ))) + } } - Action::Drop => { + Action::Drop(reason) => { // Drop with noop - Err(LookupRequestError::Failed) + Err(LookupRequestError::Failed(reason)) } Action::Continue => { // Drop this completed lookup only diff --git a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs index 3789dbe91e..30947cf1f0 100644 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs @@ -40,7 +40,7 @@ pub enum LookupRequestError { /// Inconsistent lookup request state BadState(String), /// Lookup failed for some other reason and should be dropped - Failed, + Failed(/* reason: */ String), /// Received MissingComponents when all components have been processed. This should never /// happen, and indicates some internal bug MissingComponentsAfterAllProcessed, diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 473881f182..b278d47de3 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -929,12 +929,20 @@ impl SyncManager { ) { match self.should_search_for_block(Some(slot), &peer_id) { Ok(_) => { - self.block_lookups.search_child_and_parent( + if self.block_lookups.search_child_and_parent( block_root, block_component, peer_id, &mut self.network, - ); + ) { + // Lookup created. No need to log here it's logged in `new_current_lookup` + } else { + debug!( + ?block_root, + ?parent_root, + "No lookup created for child and parent" + ); + } } Err(reason) => { debug!(%block_root, %parent_root, reason, "Ignoring unknown parent request"); @@ -945,8 +953,15 @@ impl SyncManager { fn handle_unknown_block_root(&mut self, peer_id: PeerId, block_root: Hash256) { match self.should_search_for_block(None, &peer_id) { Ok(_) => { - self.block_lookups - .search_unknown_block(block_root, &[peer_id], &mut self.network); + if self.block_lookups.search_unknown_block( + block_root, + &[peer_id], + &mut self.network, + ) { + // Lookup created. No need to log here it's logged in `new_current_lookup` + } else { + debug!(?block_root, "No lookup created for unknown block"); + } } Err(reason) => { debug!(%block_root, reason, "Ignoring unknown block request"); diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 84ff1c7e25..e7e6ff5970 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -1725,6 +1725,63 @@ fn test_parent_lookup_too_deep_grow_ancestor() { rig.assert_failed_chain(chain_hash); } +// Regression test for https://github.com/sigp/lighthouse/pull/7118 +#[test] +fn test_child_lookup_not_created_for_failed_chain_parent_after_processing() { + // GIVEN: A parent chain longer than PARENT_DEPTH_TOLERANCE. + let mut rig = TestRig::test_setup(); + let mut blocks = rig.rand_blockchain(PARENT_DEPTH_TOLERANCE + 1); + let peer_id = rig.new_connected_peer(); + + // The child of the trigger block to be used to extend the chain. + let trigger_block_child = blocks.pop().unwrap(); + // The trigger block that starts the lookup. + let trigger_block = blocks.pop().unwrap(); + let tip_root = trigger_block.canonical_root(); + + // Trigger the initial unknown parent block for the tip. + rig.trigger_unknown_parent_block(peer_id, trigger_block.clone()); + + // Simulate the lookup chain building up via `ParentUnknown` errors. + for block in blocks.into_iter().rev() { + let id = rig.expect_block_parent_request(block.canonical_root()); + rig.parent_lookup_block_response(id, peer_id, Some(block.clone())); + rig.parent_lookup_block_response(id, peer_id, None); + rig.expect_block_process(ResponseType::Block); + rig.parent_block_processed( + tip_root, + BlockProcessingResult::Err(BlockError::ParentUnknown { + parent_root: block.parent_root(), + }), + ); + } + + // At this point, the chain should have been deemed too deep and pruned. + // The tip root should have been inserted into failed chains. + rig.assert_failed_chain(tip_root); + rig.expect_no_penalty_for(peer_id); + + // WHEN: Trigger the extending block that points to the tip. + let trigger_block_child_root = trigger_block_child.canonical_root(); + rig.trigger_unknown_block_from_attestation(trigger_block_child_root, peer_id); + let id = rig.expect_block_lookup_request(trigger_block_child_root); + rig.single_lookup_block_response(id, peer_id, Some(trigger_block_child.clone())); + rig.single_lookup_block_response(id, peer_id, None); + rig.expect_block_process(ResponseType::Block); + rig.single_block_component_processed( + id.lookup_id, + BlockProcessingResult::Err(BlockError::ParentUnknown { + parent_root: tip_root, + }), + ); + + // THEN: The extending block should not create a lookup because the tip was inserted into failed chains. + rig.expect_no_active_lookups(); + // AND: The peer should be penalized for extending a failed chain. + rig.expect_single_penalty(peer_id, "failed_chain"); + rig.expect_empty_network(); +} + #[test] fn test_parent_lookup_too_deep_grow_tip() { let mut rig = TestRig::test_setup();