Fix stuck data column lookups by improving peer selection and retry logic (#8005)

Fixes the issue described in #7980 where Lighthouse repeatedly sends `DataColumnsByRoot` requests to the same peers that return empty responses, causing sync to get stuck.

The root cause was that we didn't count empty responses as failures, which led to excessive retries to unresponsive peers.


- Tracked attempts per peer to limit the number of retries sent to each peer (`MAX_CUSTODY_PEER_ATTEMPTS = 3`)
- Replaced random peer selection with hashing within each lookup, to avoid splitting a lookup into too many small requests and to improve request batching efficiency.
- Added a `single_block_lookup` root span to track all lookups created, and added more debug logs:

<img width="1264" height="501" alt="image" src="https://github.com/user-attachments/assets/983629ba-b6d0-41cf-8e93-88a5b96c2f31" />


Co-Authored-By: Jimmy Chen <jchen.tc@gmail.com>

Co-Authored-By: Jimmy Chen <jimmy@sigmaprime.io>
This commit is contained in:
Jimmy Chen
2025-09-09 16:18:05 +10:00
committed by GitHub
parent 8ec2640e04
commit ee734d1456
6 changed files with 117 additions and 67 deletions

View File

@@ -384,6 +384,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
// If we know that this lookup has unknown parent (is awaiting a parent lookup to resolve),
// signal here to hold processing downloaded data.
let mut lookup = SingleBlockLookup::new(block_root, peers, cx.next_id(), awaiting_parent);
let _guard = lookup.span.clone().entered();
// Add block components to the new request
if let Some(block_component) = block_component {

View File

@@ -7,6 +7,7 @@ use crate::sync::network_context::{
use beacon_chain::{BeaconChainTypes, BlockProcessStatus};
use derivative::Derivative;
use lighthouse_network::service::api_types::Id;
use lighthouse_tracing::SPAN_SINGLE_BLOCK_LOOKUP;
use parking_lot::RwLock;
use std::collections::HashSet;
use std::fmt::Debug;
@@ -14,6 +15,7 @@ use std::sync::Arc;
use std::time::{Duration, Instant};
use store::Hash256;
use strum::IntoStaticStr;
use tracing::{Span, debug_span};
use types::blob_sidecar::FixedBlobSidecarList;
use types::{DataColumnSidecarList, EthSpec, SignedBeaconBlock, Slot};
@@ -70,6 +72,7 @@ pub struct SingleBlockLookup<T: BeaconChainTypes> {
block_root: Hash256,
awaiting_parent: Option<Hash256>,
created: Instant,
pub(crate) span: Span,
}
#[derive(Debug)]
@@ -89,6 +92,12 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
id: Id,
awaiting_parent: Option<Hash256>,
) -> Self {
let lookup_span = debug_span!(
SPAN_SINGLE_BLOCK_LOOKUP,
block_root = %requested_block_root,
id = id,
);
Self {
id,
block_request_state: BlockRequestState::new(requested_block_root),
@@ -97,6 +106,7 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
block_root: requested_block_root,
awaiting_parent,
created: Instant::now(),
span: lookup_span,
}
}
@@ -192,6 +202,7 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
&mut self,
cx: &mut SyncNetworkContext<T>,
) -> Result<LookupResult, LookupRequestError> {
let _guard = self.span.clone().entered();
// TODO: Check what's necessary to download, specially for blobs
self.continue_request::<BlockRequestState<T::EthSpec>>(cx, 0)?;
@@ -257,6 +268,7 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
// that can make progress so it must be dropped. Consider the lookup completed.
// This case can happen if we receive the components from gossip during a retry.
if self.all_components_processed() {
self.span = Span::none();
Ok(LookupResult::Completed)
} else {
Ok(LookupResult::Pending)