From d7d56e6312e3e4d2fe6e3988853dd08f21c6a888 Mon Sep 17 00:00:00 2001 From: Lion - dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 2 Jun 2026 16:57:03 +0200 Subject: [PATCH 1/4] Delete unnecessary SyncMessage variants (#9379) - Simplification from https://github.com/sigp/lighthouse/pull/9155 Lookup sync does not cache sidecars, so sending the full network object adds unnecessary complexity. Sync only needs to know: We have received a header that has an unknown parent. Replace `UnknownParentDataColumn` and `UnknownParentPartialDataColumn` for `UnknownParentSidecarHeader` Co-Authored-By: dapplion <35266934+dapplion@users.noreply.github.com> Co-Authored-By: Eitan Seri-Levi --- .../gossip_methods.rs | 12 +++-- .../network/src/sync/block_lookups/mod.rs | 13 ++--- .../sync/block_lookups/single_block_lookup.rs | 2 +- beacon_node/network/src/sync/manager.rs | 53 +++---------------- beacon_node/network/src/sync/tests/lookups.rs | 13 ++++- 5 files changed, 31 insertions(+), 62 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index 65c95eff35..df94b473a8 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -719,17 +719,19 @@ impl NetworkBeaconProcessor { MessageAcceptance::Accept, ); } - GossipDataColumnError::ParentUnknown { parent_root, .. } => { + GossipDataColumnError::ParentUnknown { parent_root, slot } => { debug!( action = "requesting parent", %block_root, %parent_root, "Unknown parent hash for column" ); - self.send_sync_message(SyncMessage::UnknownParentDataColumn( + self.send_sync_message(SyncMessage::UnknownParentSidecarHeader { peer_id, - column_sidecar, - )); + block_root, + parent_root, + slot, + }); } GossipDataColumnError::BlockRootUnknown { block_root: unknown_block_root, @@ -1047,7 +1049,7 @@ impl NetworkBeaconProcessor { %parent_root, "Unknown parent hash for partial column" ); - self.send_sync_message(SyncMessage::UnknownParentPartialDataColumn { + self.send_sync_message(SyncMessage::UnknownParentSidecarHeader { peer_id, block_root, parent_root, diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index ecaee7c0ec..0c3cb988a9 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -74,26 +74,23 @@ const LOOKUP_MAX_DURATION_NO_PEERS_SECS: u64 = 10; /// take at most 2 GB. 200 lookups allow 3 parallel chains of depth 64 (current maximum). const MAX_LOOKUPS: usize = 200; -/// The values for `Blob`, `DataColumn` and `PartialDataColumn` is the parent root of the column. +/// The value for `Sidecar` is the parent root of the sidecar. pub enum BlockComponent { Block(DownloadResult>>), - DataColumn(DownloadResult), - PartialDataColumn(DownloadResult), + Sidecar { parent_root: Hash256 }, } impl BlockComponent { fn parent_root(&self) -> Hash256 { match self { BlockComponent::Block(block) => block.value.parent_root(), - BlockComponent::DataColumn(parent_root) - | BlockComponent::PartialDataColumn(parent_root) => parent_root.value, + BlockComponent::Sidecar { parent_root } => *parent_root, } } fn get_type(&self) -> &'static str { match self { BlockComponent::Block(_) => "block", - BlockComponent::DataColumn(_) => "data_column", - BlockComponent::PartialDataColumn(_) => "partial_data_column", + BlockComponent::Sidecar { .. } => "sidecar", } } } @@ -207,7 +204,7 @@ impl BlockLookups { block_root, Some(block_component), Some(parent_root), - // On a `UnknownParentBlock` or `UnknownParentDataColumn` event the peer is not + // On a `UnknownParentBlock` or `UnknownParentSidecarHeader` event the peer is not // required to have the rest of the block components. Create the lookup with zero // peers to house the block components. &[], diff --git a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs index dda58023be..38d6b2216d 100644 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs @@ -151,7 +151,7 @@ impl SingleBlockLookup { .block_request_state .state .insert_verified_response(block), - BlockComponent::DataColumn(_) | BlockComponent::PartialDataColumn(_) => { + BlockComponent::Sidecar { .. } => { // For now ignore single blobs and columns, as the blob request state assumes all blobs are // attributed to the same peer = the peer serving the remaining blobs. Ignoring this // block component has a minor effect, causing the node to re-request this blob diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index ecbe6227cc..37d13c2eae 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -144,11 +144,9 @@ pub enum SyncMessage { /// A block with an unknown parent has been received. UnknownParentBlock(PeerId, Arc>, Hash256), - /// A data column with an unknown parent has been received. - UnknownParentDataColumn(PeerId, Arc>), - - /// A partial data column with an unknown parent has been received. - UnknownParentPartialDataColumn { + /// A sidecar (full/partial data column) with an unknown parent has been received. Carries only the header + /// info needed to trigger a parent lookup, decoupled from the concrete sidecar type. + UnknownParentSidecarHeader { peer_id: PeerId, block_root: Hash256, parent_root: Hash256, @@ -875,58 +873,19 @@ impl SyncManager { }), ); } - SyncMessage::UnknownParentDataColumn(peer_id, data_column) => { - let data_column_slot = data_column.slot(); - let block_root = data_column.block_root(); - match data_column.as_ref() { - DataColumnSidecar::Fulu(column) => { - let parent_root = column.block_parent_root(); - debug!(%block_root, %parent_root, "Received unknown parent data column message"); - self.handle_unknown_parent( - peer_id, - block_root, - parent_root, - data_column_slot, - BlockComponent::DataColumn(DownloadResult { - value: parent_root, - block_root, - seen_timestamp: self - .chain - .slot_clock - .now_duration() - .unwrap_or_default(), - peer_group: PeerGroup::from_single(peer_id), - }), - ); - } - DataColumnSidecar::Gloas(_) => { - // TODO(gloas): proper lookup sync for Gloas. Routing into - // `handle_unknown_block_root` here mixes column processing with the - // single-block-lookup path; the Gloas column-arrives-before-block - // case wants its own queue/wakeup. - debug!(%block_root, "Received unknown block data column message"); - self.handle_unknown_block_root(peer_id, block_root); - } - } - } - SyncMessage::UnknownParentPartialDataColumn { + SyncMessage::UnknownParentSidecarHeader { peer_id, block_root, parent_root, slot, } => { - debug!(%block_root, %parent_root, "Received unknown parent partial column message"); + debug!(%block_root, %parent_root, "Received unknown parent sidecar header message"); self.handle_unknown_parent( peer_id, block_root, parent_root, slot, - BlockComponent::PartialDataColumn(DownloadResult { - value: parent_root, - block_root, - seen_timestamp: self.chain.slot_clock.now_duration().unwrap_or_default(), - peer_group: PeerGroup::from_single(peer_id), - }), + BlockComponent::Sidecar { parent_root }, ); } SyncMessage::UnknownBlockHashFromAttestation(peer_id, block_root) => { diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 1a60e4f243..3ec4d11da2 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -1365,7 +1365,18 @@ impl TestRig { peer_id: PeerId, data_column: Arc>, ) { - self.send_sync_message(SyncMessage::UnknownParentDataColumn(peer_id, data_column)); + let block_root = data_column.block_root(); + let slot = data_column.slot(); + let parent_root = match data_column.as_ref() { + DataColumnSidecar::Fulu(column) => column.block_parent_root(), + DataColumnSidecar::Gloas(_) => panic!("Gloas data column not supported in this test"), + }; + self.send_sync_message(SyncMessage::UnknownParentSidecarHeader { + peer_id, + block_root, + parent_root, + slot, + }); } fn trigger_unknown_block_from_attestation(&mut self, block_root: Hash256, peer_id: PeerId) { From c2ac519c69311527a23ea161b7bde5117755506e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Oliveira?= Date: Wed, 3 Jun 2026 09:05:31 +0100 Subject: [PATCH 2/4] Disable Mplex by default (#9365) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: João Oliveira --- beacon_node/lighthouse_network/src/config.rs | 4 ++ .../lighthouse_network/src/service/mod.rs | 10 +++-- .../lighthouse_network/src/service/utils.rs | 40 ++++++++++++------- beacon_node/src/cli.rs | 8 ++++ beacon_node/src/config.rs | 4 ++ book/src/help_bn.md | 3 ++ 6 files changed, 52 insertions(+), 17 deletions(-) diff --git a/beacon_node/lighthouse_network/src/config.rs b/beacon_node/lighthouse_network/src/config.rs index db42d0cfa8..4d4d91a456 100644 --- a/beacon_node/lighthouse_network/src/config.rs +++ b/beacon_node/lighthouse_network/src/config.rs @@ -125,6 +125,9 @@ pub struct Config { /// Whether light client protocols should be enabled. pub enable_light_client_server: bool, + /// Whether to enable the deprecated mplex multiplexer alongside yamux. + pub enable_mplex: bool, + /// Configuration for the outbound rate limiter (requests made by this node). pub outbound_rate_limiter_config: Option, @@ -362,6 +365,7 @@ impl Default for Config { proposer_only: false, metrics_enabled: false, enable_light_client_server: true, + enable_mplex: false, outbound_rate_limiter_config: None, invalid_block_storage: None, inbound_rate_limiter_config: None, diff --git a/beacon_node/lighthouse_network/src/service/mod.rs b/beacon_node/lighthouse_network/src/service/mod.rs index 41d937e324..f5e2442f86 100644 --- a/beacon_node/lighthouse_network/src/service/mod.rs +++ b/beacon_node/lighthouse_network/src/service/mod.rs @@ -466,9 +466,13 @@ impl Network { } }; - // Set up the transport - tcp/quic with noise and mplex - let transport = build_transport(local_keypair.clone(), !config.disable_quic_support) - .map_err(|e| format!("Failed to build transport: {:?}", e))?; + // Set up the transport - tcp/quic with noise and yamux (mplex optional) + let transport = build_transport( + local_keypair.clone(), + !config.disable_quic_support, + config.enable_mplex, + ) + .map_err(|e| format!("Failed to build transport: {:?}", e))?; // use the executor for libp2p struct Executor(task_executor::TaskExecutor); diff --git a/beacon_node/lighthouse_network/src/service/utils.rs b/beacon_node/lighthouse_network/src/service/utils.rs index c7dabcb391..47629f4fd3 100644 --- a/beacon_node/lighthouse_network/src/service/utils.rs +++ b/beacon_node/lighthouse_network/src/service/utils.rs @@ -34,27 +34,39 @@ pub struct Context<'a> { type BoxedTransport = Boxed<(PeerId, StreamMuxerBox)>; /// The implementation supports TCP/IP, QUIC (experimental) over UDP, noise as the encryption layer, and -/// mplex/yamux as the multiplexing layer (when using TCP). +/// yamux as the multiplexing layer (when using TCP). Mplex can be optionally enabled. pub fn build_transport( local_private_key: Keypair, quic_support: bool, + enable_mplex: bool, ) -> std::io::Result { - // mplex config - let mut mplex_config = libp2p_mplex::Config::new(); - mplex_config.set_max_buffer_size(256); - mplex_config.set_max_buffer_behaviour(libp2p_mplex::MaxBufferBehaviour::Block); - // yamux config let yamux_config = yamux::Config::default(); + // Creates the TCP transport layer - let tcp = libp2p::tcp::tokio::Transport::new(libp2p::tcp::Config::default().nodelay(true)) - .upgrade(core::upgrade::Version::V1) - .authenticate(generate_noise_config(&local_private_key)) - .multiplex(core::upgrade::SelectUpgrade::new( - yamux_config, - mplex_config, - )) - .timeout(Duration::from_secs(10)); + let tcp: BoxedTransport = if enable_mplex { + // Enable both yamux and mplex. + let mut mplex_config = libp2p_mplex::Config::new(); + mplex_config.set_max_num_streams(32); + mplex_config.set_max_buffer_behaviour(libp2p_mplex::MaxBufferBehaviour::ResetStream); + libp2p::tcp::tokio::Transport::new(libp2p::tcp::Config::default().nodelay(true)) + .upgrade(core::upgrade::Version::V1) + .authenticate(generate_noise_config(&local_private_key)) + .multiplex(core::upgrade::SelectUpgrade::new( + yamux_config, + mplex_config, + )) + .timeout(Duration::from_secs(10)) + .boxed() + } else { + // Yamux only + libp2p::tcp::tokio::Transport::new(libp2p::tcp::Config::default().nodelay(true)) + .upgrade(core::upgrade::Version::V1) + .authenticate(generate_noise_config(&local_private_key)) + .multiplex(yamux_config) + .timeout(Duration::from_secs(10)) + .boxed() + }; let transport = if quic_support { // Enables Quic // The default quic configuration suits us for now. diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 647b5858cb..988e2d1fc5 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -387,6 +387,14 @@ pub fn cli_app() -> Command { .help("Disables the quic transport. The node will rely solely on the TCP transport for libp2p connections.") .display_order(0) ) + .arg( + Arg::new("enable-mplex") + .long("enable-mplex") + .action(ArgAction::SetTrue) + .help_heading(FLAG_HEADER) + .help("Enables mplex multiplexer alongside yamux. Yamux is preferred when both are available.") + .display_order(0) + ) .arg( Arg::new("disable-peer-scoring") .long("disable-peer-scoring") diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index 045b432dc9..ddf8d07c4e 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -1443,6 +1443,10 @@ pub fn set_network_config( config.disable_quic_support = true; } + if parse_flag(cli_args, "enable-mplex") { + config.enable_mplex = true; + } + if parse_flag(cli_args, "disable-upnp") { config.upnp_enabled = false; } diff --git a/book/src/help_bn.md b/book/src/help_bn.md index 30163f1f0c..1f57db1b59 100644 --- a/book/src/help_bn.md +++ b/book/src/help_bn.md @@ -494,6 +494,9 @@ Flags: Sets the local ENR IP address and port to match those set for lighthouse. Specifically, the IP address will be the value of --listen-address and the UDP port will be --discovery-port. + --enable-mplex + Enables mplex multiplexer alongside yamux. Yamux is preferred when + both are available. --enable-partial-columns Enable partial messages for data columns. This can reduce the amount of data sent over the network. Enabled by default on Hoodi and From eab5163d68f2c1e88b43627956dcd496a321ab41 Mon Sep 17 00:00:00 2001 From: Lion - dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 3 Jun 2026 17:29:04 +0200 Subject: [PATCH 3/4] Remove RequestState trait from lookup sync (#9391) Co-Authored-By: dapplion <35266934+dapplion@users.noreply.github.com> --- .../network/src/sync/block_lookups/common.rs | 164 ------ .../network/src/sync/block_lookups/mod.rs | 275 ++-------- .../sync/block_lookups/single_block_lookup.rs | 510 ++++++++++-------- beacon_node/network/src/sync/manager.rs | 29 +- .../network/src/sync/network_context.rs | 29 +- .../src/sync/network_context/custody.rs | 12 +- 6 files changed, 369 insertions(+), 650 deletions(-) delete mode 100644 beacon_node/network/src/sync/block_lookups/common.rs diff --git a/beacon_node/network/src/sync/block_lookups/common.rs b/beacon_node/network/src/sync/block_lookups/common.rs deleted file mode 100644 index 4306458615..0000000000 --- a/beacon_node/network/src/sync/block_lookups/common.rs +++ /dev/null @@ -1,164 +0,0 @@ -use crate::sync::block_lookups::single_block_lookup::{ - LookupRequestError, SingleBlockLookup, SingleLookupRequestState, -}; -use crate::sync::block_lookups::{BlockRequestState, CustodyRequestState, PeerId}; -use crate::sync::manager::BlockProcessType; -use crate::sync::network_context::{LookupRequestResult, SyncNetworkContext}; -use beacon_chain::BeaconChainTypes; -use lighthouse_network::service::api_types::Id; -use parking_lot::RwLock; -use std::collections::HashSet; -use std::sync::Arc; -use types::{DataColumnSidecarList, SignedBeaconBlock}; - -use super::SingleLookupId; -use super::single_block_lookup::{ComponentRequests, DownloadResult}; - -#[derive(Debug, Copy, Clone)] -pub enum ResponseType { - Block, - CustodyColumn, -} - -/// This trait unifies common single block lookup functionality across blocks and data columns. -/// This includes making requests, verifying responses, and handling processing results. A -/// `SingleBlockLookup` includes both a `BlockRequestState` and a `CustodyRequestState`, this trait -/// is implemented for each. -/// -/// The use of the `ResponseType` associated type gives us a degree of type -/// safety when handling a block/column response ensuring we only mutate the correct corresponding -/// state. -pub trait RequestState { - /// The type created after validation. - type VerifiedResponseType: Clone; - - /// Request the network context to prepare a request of a component of `block_root`. If the - /// request is not necessary because the component is already known / processed, return false. - /// Return true if it sent a request and we can expect an event back from the network. - fn make_request( - &self, - id: Id, - lookup_peers: Arc>>, - expected_blobs: usize, - cx: &mut SyncNetworkContext, - ) -> Result; - - /* Response handling methods */ - - /// Send the response to the beacon processor. - fn send_for_processing( - id: Id, - result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError>; - - /* Utility methods */ - - /// Returns the `ResponseType` associated with this trait implementation. Useful in logging. - fn response_type() -> ResponseType; - - /// A getter for the `BlockRequestState` or `CustodyRequestState` associated with this trait. - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str>; - - /// A getter for a reference to the `SingleLookupRequestState` associated with this trait. - fn get_state(&self) -> &SingleLookupRequestState; - - /// A getter for a mutable reference to the SingleLookupRequestState associated with this trait. - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState; -} - -impl RequestState for BlockRequestState { - type VerifiedResponseType = Arc>; - - fn make_request( - &self, - id: SingleLookupId, - lookup_peers: Arc>>, - _: usize, - cx: &mut SyncNetworkContext, - ) -> Result { - cx.block_lookup_request(id, lookup_peers, self.requested_block_root) - .map_err(LookupRequestError::SendFailedNetwork) - } - - fn send_for_processing( - id: SingleLookupId, - download_result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError> { - let DownloadResult { - value, - block_root, - seen_timestamp, - .. - } = download_result; - cx.send_block_for_processing(id, block_root, value, seen_timestamp) - .map_err(LookupRequestError::SendFailedProcessor) - } - - fn response_type() -> ResponseType { - ResponseType::Block - } - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str> { - Ok(&mut request.block_request_state) - } - fn get_state(&self) -> &SingleLookupRequestState { - &self.state - } - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState { - &mut self.state - } -} - -impl RequestState for CustodyRequestState { - type VerifiedResponseType = DataColumnSidecarList; - - fn make_request( - &self, - id: Id, - lookup_peers: Arc>>, - _: usize, - cx: &mut SyncNetworkContext, - ) -> Result { - cx.custody_lookup_request(id, self.block_root, self.slot, lookup_peers) - .map_err(LookupRequestError::SendFailedNetwork) - } - - fn send_for_processing( - id: Id, - download_result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError> { - let DownloadResult { - value, - block_root, - seen_timestamp, - .. - } = download_result; - cx.send_custody_columns_for_processing( - id, - block_root, - value, - seen_timestamp, - BlockProcessType::SingleCustodyColumn(id), - ) - .map_err(LookupRequestError::SendFailedProcessor) - } - - fn response_type() -> ResponseType { - ResponseType::CustodyColumn - } - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str> { - match &mut request.component_requests { - ComponentRequests::WaitingForBlock => Err("waiting for block"), - ComponentRequests::ActiveCustodyRequest(request) => Ok(request), - ComponentRequests::NotNeeded { .. } => Err("not needed"), - } - } - fn get_state(&self) -> &SingleLookupRequestState { - &self.state - } - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState { - &mut self.state - } -} diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index 0c3cb988a9..a265373e3f 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -24,27 +24,23 @@ use self::parent_chain::{NodeChain, compute_parent_chains}; pub use self::single_block_lookup::DownloadResult; use self::single_block_lookup::{LookupRequestError, LookupResult, SingleBlockLookup}; use super::manager::{BlockProcessType, SLOT_IMPORT_TOLERANCE}; -use super::network_context::{PeerGroup, RpcResponseError, SyncNetworkContext}; +use super::network_context::{RpcResponseError, SyncNetworkContext}; use crate::metrics; use crate::network_beacon_processor::BlockProcessingResult; use crate::sync::SyncMessage; use crate::sync::block_lookups::parent_chain::find_oldest_fork_ancestor; use beacon_chain::BeaconChainTypes; -use beacon_chain::block_verification_types::AsBlock; -pub use common::RequestState; use fnv::FnvHashMap; use lighthouse_network::PeerId; use lighthouse_network::service::api_types::SingleLookupReqId; use lru_cache::LRUTimeCache; -pub use single_block_lookup::{BlockRequestState, CustodyRequestState}; use std::collections::hash_map::Entry; use std::sync::Arc; use std::time::Duration; use store::Hash256; use tracing::{debug, error, warn}; -use types::{EthSpec, SignedBeaconBlock}; +use types::{DataColumnSidecarList, EthSpec, SignedBeaconBlock}; -pub mod common; pub mod parent_chain; mod single_block_lookup; @@ -74,35 +70,17 @@ const LOOKUP_MAX_DURATION_NO_PEERS_SECS: u64 = 10; /// take at most 2 GB. 200 lookups allow 3 parallel chains of depth 64 (current maximum). const MAX_LOOKUPS: usize = 200; -/// The value for `Sidecar` is the parent root of the sidecar. +type BlockDownloadResponse = Result>>, RpcResponseError>; +type CustodyDownloadResponse = + Result>, RpcResponseError>; + pub enum BlockComponent { Block(DownloadResult>>), - Sidecar { parent_root: Hash256 }, -} - -impl BlockComponent { - fn parent_root(&self) -> Hash256 { - match self { - BlockComponent::Block(block) => block.value.parent_root(), - BlockComponent::Sidecar { parent_root } => *parent_root, - } - } - fn get_type(&self) -> &'static str { - match self { - BlockComponent::Block(_) => "block", - BlockComponent::Sidecar { .. } => "sidecar", - } - } + Sidecar, } pub type SingleLookupId = u32; -enum Action { - Retry, - ParentUnknown { parent_root: Hash256 }, - Continue, -} - pub struct BlockLookups { /// A cache of block roots that must be ignored for some time to prevent useless searches. For /// example if a chain is too long, its lookup chain is dropped, and range sync is expected to @@ -190,11 +168,10 @@ impl BlockLookups { &mut self, block_root: Hash256, block_component: BlockComponent, + parent_root: Hash256, peer_id: PeerId, cx: &mut SyncNetworkContext, ) -> bool { - let parent_root = block_component.parent_root(); - let parent_lookup_exists = self.search_parent_of_child(parent_root, block_root, &[peer_id], cx); // Only create the child lookup if the parent exists @@ -215,7 +192,7 @@ impl BlockLookups { } } - /// Seach a block whose parent root is unknown. + /// Search a block whose parent root is unknown. /// /// Returns true if the lookup is created or already exists #[must_use = "only reference the new lookup if returns true"] @@ -358,13 +335,9 @@ impl BlockLookups { .find(|(_id, lookup)| lookup.is_for_block(block_root)) { if let Some(block_component) = block_component { - let component_type = block_component.get_type(); let imported = lookup.add_child_components(block_component); if !imported { - debug!( - ?block_root, - component_type, "Lookup child component ignored" - ); + debug!(?block_root, "Lookup child component ignored"); } } @@ -436,88 +409,33 @@ impl BlockLookups { /* Lookup responses */ - /// Process a block or blob response received from a single lookup request. - pub fn on_download_response>( + /// Process a block response received from a single lookup request. + pub fn on_block_download_response( &mut self, id: SingleLookupReqId, - response: Result<(R::VerifiedResponseType, PeerGroup, Duration), RpcResponseError>, + response: BlockDownloadResponse, cx: &mut SyncNetworkContext, ) { - let result = self.on_download_response_inner::(id, response, cx); - self.on_lookup_result(id.lookup_id, result, "download_response", cx); + let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else { + debug!(?id, "Block returned for single block lookup not present"); + return; + }; + let result = lookup.on_block_download_response(id.req_id, response, cx); + self.on_lookup_result(id.lookup_id, result, "block_download_response", cx); } - /// Process a block or blob response received from a single lookup request. - pub fn on_download_response_inner>( + pub fn on_custody_download_response( &mut self, id: SingleLookupReqId, - response: Result<(R::VerifiedResponseType, PeerGroup, Duration), RpcResponseError>, + response: CustodyDownloadResponse, cx: &mut SyncNetworkContext, - ) -> Result { - // Note: no need to downscore peers here, already downscored on network context - - let response_type = R::response_type(); + ) { let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else { - // We don't have the ability to cancel in-flight RPC requests. So this can happen - // if we started this RPC request, and later saw the block/blobs via gossip. - debug!(?id, "Block returned for single block lookup not present"); - return Err(LookupRequestError::UnknownLookup); + debug!(?id, "Custody returned for single block lookup not present"); + return; }; - - let block_root = lookup.block_root(); - let request_state = R::request_state_mut(lookup) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))? - .get_state_mut(); - - match response { - Ok((response, peer_group, seen_timestamp)) => { - debug!( - ?block_root, - ?id, - ?peer_group, - ?response_type, - "Received lookup download success" - ); - - // Here we could check if response extends a parent chain beyond its max length. - // However we defer that check to the handling of a processing error ParentUnknown. - // - // Here we could check if there's already a lookup for parent_root of `response`. In - // that case we know that sending the response for processing will likely result in - // a `ParentUnknown` error. However, for simplicity we choose to not implement this - // optimization. - - // Register the download peer here. Once we have received some data over the wire we - // attribute it to this peer for scoring latter regardless of how the request was - // done. - request_state.on_download_success( - id.req_id, - DownloadResult { - value: response, - block_root, - seen_timestamp, - peer_group, - }, - )?; - // continue_request will send for processing as the request state is AwaitingProcessing - } - Err(e) => { - // No need to log peer source here. When sending a DataColumnsByRoot request we log - // the peer and the request ID which is linked to this `id` value here. - debug!( - ?block_root, - ?id, - ?response_type, - error = ?e, - "Received lookup download failure" - ); - - request_state.on_download_failure(id.req_id)?; - // continue_request will retry a download as the request state is AwaitingDownload - } - } - - lookup.continue_requests(cx) + let result = lookup.on_custody_download_response(id.req_id, response, cx); + self.on_lookup_result(id.lookup_id, result, "custody_download_response", cx); } /* Error responses */ @@ -539,128 +457,29 @@ impl BlockLookups { result: BlockProcessingResult, cx: &mut SyncNetworkContext, ) { - let lookup_result = match process_type { - BlockProcessType::SingleBlock { id } => { - self.on_processing_result_inner::>(id, result, cx) - } - BlockProcessType::SingleCustodyColumn(id) => { - self.on_processing_result_inner::>(id, result, cx) - } - // TODO(gloas): route into the payload envelope lookup state machine. - BlockProcessType::SinglePayloadEnvelope(_) => Ok(LookupResult::Pending), - }; - self.on_lookup_result(process_type.id(), lookup_result, "processing_result", cx); - } - - pub fn on_processing_result_inner>( - &mut self, - lookup_id: SingleLookupId, - result: BlockProcessingResult, - cx: &mut SyncNetworkContext, - ) -> Result { + let lookup_id = process_type.id(); let Some(lookup) = self.single_block_lookups.get_mut(&lookup_id) else { debug!(id = lookup_id, "Unknown single block lookup"); - return Err(LookupRequestError::UnknownLookup); + return; }; - let block_root = lookup.block_root(); - let request_state = R::request_state_mut(lookup) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))? - .get_state_mut(); - debug!( - component = ?R::response_type(), - ?block_root, + block_root = ?lookup.block_root(), id = lookup_id, + ?process_type, ?result, "Received lookup processing result" ); - let action = match result { - BlockProcessingResult::Imported(fully_imported, _info) => { - // `on_processing_success` is called here to ensure the request state is updated - // prior to checking if all components have been processed (relevant for - // MissingComponents). - request_state.on_processing_success()?; - - if fully_imported { - Action::Continue - } else if lookup.all_components_processed() { - // We don't request for other block components until being sure that the block has - // data. If we request blobs / columns to a peer we are sure those must exist. - // Therefore if all components are processed and we still receive `MissingComponents` - // it indicates an internal bug. - return Err(LookupRequestError::Failed( - "missing components after all processed".to_owned(), - )); - } else { - Action::Retry - } - } - BlockProcessingResult::ParentUnknown { parent_root } => { - // `BlockError::ParentUnknown` is only returned when processing blocks. Reverts - // the status of this request to `AwaitingProcessing` holding the downloaded - // data. A future call to `continue_requests` will re-submit it once there are - // no pending parent requests. - request_state.revert_to_awaiting_processing()?; - Action::ParentUnknown { parent_root } - } - BlockProcessingResult::Error { penalty, reason } => { - // Retry on every processing error: `on_processing_failure` increments the - // per-component failure counter, so `SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS` bounds the - // retry loop and eventually drops the lookup if the failure persists. Whether the - // peer should be downscored is the producer's call (encoded in `penalty`). - debug!( - ?block_root, - component = ?R::response_type(), - reason, - ?penalty, - "Lookup component processing failed; retrying" - ); - let peer_group = request_state.on_processing_failure()?; - if let Some((action_kind, whom, msg)) = penalty { - whom.apply(action_kind, &peer_group, msg, cx); - } - Action::Retry + let lookup_result = match process_type { + BlockProcessType::SingleBlock { .. } => lookup.on_block_processing_result(result, cx), + BlockProcessType::SingleCustodyColumn(_) => { + lookup.on_data_processing_result(result, cx) } + // TODO(gloas): route into the payload envelope lookup state machine. + BlockProcessType::SinglePayloadEnvelope(_) => Ok(LookupResult::Pending), }; - - match action { - Action::Retry => { - // Trigger download for all components in case `MissingComponents` failed the blob - // request. Also if blobs are `AwaitingProcessing` and need to be progressed - lookup.continue_requests(cx) - } - Action::ParentUnknown { parent_root } => { - let peers = lookup.all_peers(); - // Mark lookup as awaiting **before** creating the parent lookup. At this point the - // lookup maybe inconsistent. - lookup.set_awaiting_parent(parent_root); - let parent_lookup_exists = - self.search_parent_of_child(parent_root, block_root, &peers, cx); - if parent_lookup_exists { - // The parent lookup exist or has been created. It's safe for `lookup` to - // reference the parent as awaiting. - debug!( - id = lookup_id, - ?block_root, - ?parent_root, - "Marking lookup as awaiting parent" - ); - Ok(LookupResult::Pending) - } else { - // The parent lookup is faulty and was not created, we must drop the `lookup` as - // it's in an inconsistent state. We must drop all of its children too. - Err(LookupRequestError::Failed(format!( - "Parent lookup is faulty {parent_root:?}" - ))) - } - } - Action::Continue => { - // Drop this completed lookup only - Ok(LookupResult::Completed) - } - } + self.on_lookup_result(lookup_id, lookup_result, "processing_result", cx); } pub fn on_external_processing_result( @@ -757,7 +576,20 @@ impl BlockLookups { cx: &mut SyncNetworkContext, ) -> bool { match result { - Ok(LookupResult::Pending) => true, // no action + Ok(LookupResult::Pending) => true, + Ok(LookupResult::ParentUnknown { + parent_root, + block_root, + peers, + }) => { + if self.search_parent_of_child(parent_root, block_root, &peers, cx) { + true + } else { + self.drop_lookup_and_children(id, "Failed"); + self.update_metrics(); + false + } + } Ok(LookupResult::Completed) => { if let Some(lookup) = self.single_block_lookups.remove(&id) { debug!( @@ -923,6 +755,7 @@ impl BlockLookups { } /// Adds peers to a lookup and its ancestors recursively. + /// /// Note: Takes a `lookup_id` as argument to allow recursion on mutable lookups, without having /// to duplicate the code to add peers to a lookup fn add_peers_to_lookup_and_ancestors( @@ -949,12 +782,12 @@ impl BlockLookups { } if let Some(parent_root) = lookup.awaiting_parent() { - if let Some((&child_id, _)) = self + if let Some((&parent_id, _)) = self .single_block_lookups .iter() .find(|(_, l)| l.block_root() == parent_root) { - self.add_peers_to_lookup_and_ancestors(child_id, peers, cx) + self.add_peers_to_lookup_and_ancestors(parent_id, peers, cx) } else { Err(format!("Lookup references unknown parent {parent_root:?}")) } diff --git a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs index 38d6b2216d..8eb58da4e6 100644 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs @@ -1,15 +1,17 @@ use super::{BlockComponent, PeerId, SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS}; -use crate::sync::block_lookups::common::RequestState; +use crate::network_beacon_processor::BlockProcessingResult; +use crate::sync::block_lookups::{BlockDownloadResponse, CustodyDownloadResponse}; +use crate::sync::manager::BlockProcessType; use crate::sync::network_context::{ - LookupRequestResult, PeerGroup, ReqId, RpcRequestSendError, SendErrorProcessor, - SyncNetworkContext, + LookupRequestResult, PeerGroup, ReqId, RpcRequestSendError, RpcResponseError, + SendErrorProcessor, SyncNetworkContext, }; -use beacon_chain::{BeaconChainTypes, BlockProcessStatus}; +use beacon_chain::BeaconChainTypes; +use beacon_chain::block_verification_types::AsBlock; use educe::Educe; use lighthouse_network::service::api_types::Id; use parking_lot::RwLock; use std::collections::HashSet; -use std::fmt::Debug; use std::sync::Arc; use std::time::{Duration, Instant}; use store::Hash256; @@ -24,15 +26,18 @@ pub enum LookupResult { Completed, /// Lookup is expecting some future event from the network Pending, + /// Block's parent is not known to fork-choice, a parent lookup is needed + ParentUnknown { + parent_root: Hash256, + block_root: Hash256, + peers: Vec, + }, } #[derive(Debug, PartialEq, Eq, IntoStaticStr)] pub enum LookupRequestError { /// Too many failed attempts - TooManyAttempts { - /// The failed attempts were primarily due to processing failures. - cannot_process: bool, - }, + TooManyAttempts, /// Error sending event to network SendFailedNetwork(RpcRequestSendError), /// Error sending event to processor @@ -52,33 +57,63 @@ pub enum LookupRequestError { }, } +#[derive(Debug)] +struct BlockRequest { + state: SingleLookupRequestState>>, +} + +impl BlockRequest { + fn new() -> Self { + Self { + state: SingleLookupRequestState::new(), + } + } + + fn is_complete(&self) -> bool { + self.state.is_processed() + } +} + +#[derive(Debug)] +enum DataRequest { + WaitingForBlock, + Request { + slot: Slot, + state: SingleLookupRequestState>, + }, + NoData, +} + +impl DataRequest { + fn is_complete(&self) -> bool { + match &self { + DataRequest::WaitingForBlock => false, + DataRequest::Request { state, .. } => state.is_processed(), + DataRequest::NoData => true, + } + } +} + +type PeerSet = Arc>>; + #[derive(Educe)] #[educe(Debug(bound(T: BeaconChainTypes)))] pub struct SingleBlockLookup { pub id: Id, - pub block_request_state: BlockRequestState, - pub component_requests: ComponentRequests, + block_root: Hash256, + block_request: BlockRequest, + data_request: DataRequest, /// Peers that claim to have imported this set of block components. This state is shared with /// the custody request to have an updated view of the peers that claim to have imported the /// block associated with this lookup. The peer set of a lookup can change rapidly, and faster /// than the lifetime of a custody request. #[educe(Debug(method(fmt_peer_set_as_len)))] - peers: Arc>>, - block_root: Hash256, + peers: PeerSet, awaiting_parent: Option, created: Instant, pub(crate) span: Span, } -#[derive(Debug)] -pub(crate) enum ComponentRequests { - WaitingForBlock, - ActiveCustodyRequest(CustodyRequestState), - // When printing in debug this state display the reason why it's not needed - #[allow(dead_code)] - NotNeeded(&'static str), -} - impl SingleBlockLookup { pub fn new( requested_block_root: Hash256, @@ -94,25 +129,25 @@ impl SingleBlockLookup { Self { id, - block_request_state: BlockRequestState::new(requested_block_root), - component_requests: ComponentRequests::WaitingForBlock, - peers: Arc::new(RwLock::new(HashSet::from_iter(peers.iter().copied()))), block_root: requested_block_root, + block_request: BlockRequest::new(), + data_request: DataRequest::WaitingForBlock, + peers: Arc::new(RwLock::new(peers.iter().copied().collect())), awaiting_parent, created: Instant::now(), span: lookup_span, } } - /// Reset the status of all internal requests + /// Reset the status of all requests (used on block processing failure) pub fn reset_requests(&mut self) { - self.block_request_state = BlockRequestState::new(self.block_root); - self.component_requests = ComponentRequests::WaitingForBlock; + self.block_request = BlockRequest::new(); + self.data_request = DataRequest::WaitingForBlock; } - /// Return the slot of this lookup's block if it's currently cached as `AwaitingProcessing` + /// Return the slot of this lookup's block if it's currently cached pub fn peek_downloaded_block_slot(&self) -> Option { - self.block_request_state + self.block_request .state .peek_downloaded_data() .map(|block| block.slot()) @@ -147,15 +182,12 @@ impl SingleBlockLookup { /// Maybe insert a verified response into this lookup. Returns true if imported pub fn add_child_components(&mut self, block_component: BlockComponent) -> bool { match block_component { - BlockComponent::Block(block) => self - .block_request_state - .state - .insert_verified_response(block), - BlockComponent::Sidecar { .. } => { - // For now ignore single blobs and columns, as the blob request state assumes all blobs are - // attributed to the same peer = the peer serving the remaining blobs. Ignoring this - // block component has a minor effect, causing the node to re-request this blob - // once the parent chain is successfully resolved + BlockComponent::Block(block) => { + self.block_request.state.insert_verified_response(block) + } + BlockComponent::Sidecar => { + // There's nothing to do here, there's no component to insert. The lookup downloads + // its required data columns itself once it has the block. false } } @@ -166,29 +198,14 @@ impl SingleBlockLookup { self.block_root() == block_root } - /// Returns true if the block has already been downloaded. - pub fn all_components_processed(&self) -> bool { - self.block_request_state.state.is_processed() - && match &self.component_requests { - ComponentRequests::WaitingForBlock => false, - ComponentRequests::ActiveCustodyRequest(request) => request.state.is_processed(), - ComponentRequests::NotNeeded { .. } => true, - } - } - /// Returns true if this request is expecting some event to make progress pub fn is_awaiting_event(&self) -> bool { self.awaiting_parent.is_some() - || self.block_request_state.state.is_awaiting_event() - || match &self.component_requests { - // If components are waiting for the block request to complete, here we should - // check if the`block_request_state.state.is_awaiting_event(). However we already - // checked that above, so `WaitingForBlock => false` is equivalent. - ComponentRequests::WaitingForBlock => false, - ComponentRequests::ActiveCustodyRequest(request) => { - request.state.is_awaiting_event() - } - ComponentRequests::NotNeeded { .. } => false, + || self.block_request.state.is_awaiting_event() + || match &self.data_request { + DataRequest::WaitingForBlock => true, + DataRequest::Request { state, .. } => state.is_awaiting_event(), + DataRequest::NoData => false, } } @@ -199,139 +216,167 @@ impl SingleBlockLookup { cx: &mut SyncNetworkContext, ) -> Result { let _guard = self.span.clone().entered(); - // TODO: Check what's necessary to download, specially for blobs - self.continue_request::>(cx, 0)?; - if let ComponentRequests::WaitingForBlock = self.component_requests { - let downloaded_block = self - .block_request_state - .state - .peek_downloaded_data() - .cloned(); - - if let Some(block) = downloaded_block.or_else(|| { - // If the block is already being processed or fully validated, retrieve how many blobs - // it expects. Consider any stage of the block. If the block root has been validated, we - // can assert that this is the correct value of `blob_kzg_commitments_count`. - match cx.chain.get_block_process_status(&self.block_root) { - BlockProcessStatus::Unknown => None, - BlockProcessStatus::NotValidated(block, _) - | BlockProcessStatus::ExecutionValidated(block) => Some(block.clone()), - } - }) { - let expected_blobs = block.num_expected_blobs(); - let block_epoch = block.slot().epoch(T::EthSpec::slots_per_epoch()); - if expected_blobs == 0 { - self.component_requests = ComponentRequests::NotNeeded("no data"); - } else if cx.chain.should_fetch_custody_columns(block_epoch) { - self.component_requests = ComponentRequests::ActiveCustodyRequest( - CustodyRequestState::new(self.block_root, block.slot()), - ); - } else { - self.component_requests = ComponentRequests::NotNeeded("outside da window"); - } - } else { - // Wait to download the block before downloading blobs. Then we can be sure that the - // block has data, so there's no need to do "blind" requests for all possible blobs and - // latter handle the case where if the peer sent no blobs, penalize. - // - // Lookup sync event safety: Reaching this code means that a block is not in any pre-import - // cache nor in the request state of this lookup. Therefore, the block must either: (1) not - // be downloaded yet or (2) the block is already imported into the fork-choice. - // In case (1) the lookup must either successfully download the block or get dropped. - // In case (2) the block will be downloaded, processed, reach `DuplicateFullyImported` - // and get dropped as completed. - } + // === Block request === + self.block_request.state.maybe_start_downloading(|| { + cx.block_lookup_request(self.id, self.peers.clone(), self.block_root) + })?; + if self.awaiting_parent.is_none() + && let Some(data) = self.block_request.state.maybe_start_processing() + { + cx.send_block_for_processing(self.id, self.block_root, data.value, data.seen_timestamp) + .map_err(LookupRequestError::SendFailedProcessor)?; } - match &self.component_requests { - ComponentRequests::WaitingForBlock => {} // do nothing - ComponentRequests::ActiveCustodyRequest(_) => { - self.continue_request::>(cx, 0)? + // === Data request === + loop { + match &mut self.data_request { + DataRequest::WaitingForBlock => { + if let Some(block) = self.block_request.state.peek_downloaded_data() { + let block_epoch = block + .slot() + .epoch(::EthSpec::slots_per_epoch()); + self.data_request = if block.num_expected_blobs() == 0 { + DataRequest::NoData + } else if cx.chain.should_fetch_custody_columns(block_epoch) { + DataRequest::Request { + slot: block.slot(), + state: SingleLookupRequestState::new(), + } + } else { + DataRequest::NoData + }; + } else { + break; + } + } + DataRequest::Request { slot, state } => { + state.maybe_start_downloading(|| { + cx.custody_lookup_request( + self.id, + self.block_root, + *slot, + self.peers.clone(), + ) + })?; + // Wait for the parent to be imported, data column processing result handle does + // not support `ParentUnknown`. + if self.awaiting_parent.is_none() + && let Some(data) = state.maybe_start_processing() + { + cx.send_custody_columns_for_processing( + self.id, + self.block_root, + data.value, + data.seen_timestamp, + BlockProcessType::SingleCustodyColumn(self.id), + ) + .map_err(LookupRequestError::SendFailedProcessor)?; + } + break; + } + DataRequest::NoData => break, } - ComponentRequests::NotNeeded { .. } => {} // do nothing } // If all components of this lookup are already processed, there will be no future events // that can make progress so it must be dropped. Consider the lookup completed. // This case can happen if we receive the components from gossip during a retry. - if self.all_components_processed() { - self.span = Span::none(); - Ok(LookupResult::Completed) - } else { - Ok(LookupResult::Pending) + if self.block_request.is_complete() && self.data_request.is_complete() { + return Ok(LookupResult::Completed); } + + Ok(LookupResult::Pending) } - /// Potentially makes progress on this request if it's in a progress-able state - fn continue_request>( + /// Handle block processing result. Advances the lookup state machine. + pub fn on_block_processing_result( &mut self, + result: BlockProcessingResult, cx: &mut SyncNetworkContext, - expected_blobs: usize, - ) -> Result<(), LookupRequestError> { - let id = self.id; - let awaiting_parent = self.awaiting_parent.is_some(); - let request = - R::request_state_mut(self).map_err(|e| LookupRequestError::BadState(e.to_owned()))?; - - // Attempt to progress awaiting downloads - if request.get_state().is_awaiting_download() { - // Verify the current request has not exceeded the maximum number of attempts. - let request_state = request.get_state(); - if request_state.failed_attempts() >= SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS { - let cannot_process = request_state.more_failed_processing_attempts(); - return Err(LookupRequestError::TooManyAttempts { cannot_process }); + ) -> Result { + match result { + BlockProcessingResult::Imported(_fully_imported, _info) => { + self.block_request.state.on_processing_success()?; } - - let peers = self.peers.clone(); - let request = R::request_state_mut(self) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))?; - - match request.make_request(id, peers, expected_blobs, cx)? { - LookupRequestResult::RequestSent(req_id) => { - // Lookup sync event safety: If make_request returns `RequestSent`, we are - // guaranteed that `BlockLookups::on_download_response` will be called exactly - // with this `req_id`. - request.get_state_mut().on_download_start(req_id)? - } - LookupRequestResult::NoRequestNeeded(reason) => { - // Lookup sync event safety: Advances this request to the terminal `Processed` - // state. If all requests reach this state, the request is marked as completed - // in `Self::continue_requests`. - request.get_state_mut().on_completed_request(reason)? - } - // Sync will receive a future event to make progress on the request, do nothing now - LookupRequestResult::Pending(reason) => { - // Lookup sync event safety: Refer to the code paths constructing - // `LookupRequestResult::Pending` - request - .get_state_mut() - .update_awaiting_download_status(reason); - return Ok(()); + BlockProcessingResult::ParentUnknown { parent_root } => { + // `BlockError::ParentUnknown` is only returned when processing blocks. Revert the + // block request to `Downloaded` and park this lookup until the parent resolves; a + // future call to `continue_requests` will re-submit the block for processing once + // the parent lookup completes. + self.block_request.state.revert_to_awaiting_processing()?; + self.set_awaiting_parent(parent_root); + return Ok(LookupResult::ParentUnknown { + parent_root, + block_root: self.block_root, + peers: self.all_peers(), + }); + } + BlockProcessingResult::Error { penalty, .. } => { + let peers = self.block_request.state.on_processing_failure()?; + if let Some((action, whom, msg)) = penalty { + whom.apply(action, &peers, msg, cx); } } - - // Otherwise, attempt to progress awaiting processing - // If this request is awaiting a parent lookup to be processed, do not send for processing. - // The request will be rejected with unknown parent error. - } else if !awaiting_parent { - // maybe_start_processing returns Some if state == AwaitingProcess. This pattern is - // useful to conditionally access the result data. - if let Some(result) = request.get_state_mut().maybe_start_processing() { - // Lookup sync event safety: If `send_for_processing` returns Ok() we are guaranteed - // that `BlockLookups::on_processing_result` will be called exactly once with this - // lookup_id - return R::send_for_processing(id, result, cx); - } - // Lookup sync event safety: If the request is not in `AwaitingDownload` or - // `AwaitingProcessing` state it is guaranteed to receive some event to make progress. } + self.continue_requests(cx) + } - // Lookup sync event safety: If a lookup is awaiting a parent we are guaranteed to either: - // (1) attempt to make progress with `BlockLookups::continue_child_lookups` if the parent - // lookup completes, or (2) get dropped if the parent fails and is dropped. + /// Handle data processing result + pub fn on_data_processing_result( + &mut self, + result: BlockProcessingResult, + cx: &mut SyncNetworkContext, + ) -> Result { + let DataRequest::Request { state, .. } = &mut self.data_request else { + return Err(LookupRequestError::BadState("no data_request".to_owned())); + }; - Ok(()) + match result { + BlockProcessingResult::Imported(_fully_imported, _info) => { + state.on_processing_success()?; + } + BlockProcessingResult::ParentUnknown { .. } => { + return Err(LookupRequestError::BadState( + "data processing returned ParentUnknown".to_owned(), + )); + } + BlockProcessingResult::Error { penalty, .. } => { + let peers = state.on_processing_failure()?; + if let Some((action, whom, msg)) = penalty { + whom.apply(action, &peers, msg, cx); + } + } + } + self.continue_requests(cx) + } + + /// Handle a block download response. Updates download state and advances the lookup. + pub fn on_block_download_response( + &mut self, + req_id: ReqId, + result: BlockDownloadResponse, + cx: &mut SyncNetworkContext, + ) -> Result { + self.block_request + .state + .on_download_response(req_id, result)?; + self.continue_requests(cx) + } + + /// Handle a custody columns download response. Updates download state and advances the lookup. + pub fn on_custody_download_response( + &mut self, + req_id: ReqId, + result: CustodyDownloadResponse, + cx: &mut SyncNetworkContext, + ) -> Result { + let DataRequest::Request { state, .. } = &mut self.data_request else { + return Err(LookupRequestError::BadState("no data_request".to_owned())); + }; + + state.on_download_response(req_id, result)?; + self.continue_requests(cx) } /// Get all unique peers that claim to have imported this set of block components @@ -340,7 +385,7 @@ impl SingleBlockLookup { } /// Add peer to all request states. The peer must be able to serve this request. - /// Returns true if the peer was newly inserted into some request state. + /// Returns true if the peer was newly inserted into any peer set. pub fn add_peer(&mut self, peer_id: PeerId) -> bool { self.peers.write().insert(peer_id) } @@ -356,52 +401,23 @@ impl SingleBlockLookup { } } -/// The state of the custody request component of a `SingleBlockLookup`. -#[derive(Educe)] -#[educe(Debug)] -pub struct CustodyRequestState { - #[educe(Debug(ignore))] - pub block_root: Hash256, - pub slot: Slot, - pub state: SingleLookupRequestState>, -} - -impl CustodyRequestState { - pub fn new(block_root: Hash256, slot: Slot) -> Self { - Self { - block_root, - slot, - state: SingleLookupRequestState::new(), - } - } -} - -/// The state of the block request component of a `SingleBlockLookup`. -#[derive(Educe)] -#[educe(Debug)] -pub struct BlockRequestState { - #[educe(Debug(ignore))] - pub requested_block_root: Hash256, - pub state: SingleLookupRequestState>>, -} - -impl BlockRequestState { - pub fn new(block_root: Hash256) -> Self { - Self { - requested_block_root: block_root, - state: SingleLookupRequestState::new(), - } - } -} - #[derive(Debug, Clone)] pub struct DownloadResult { pub value: T, - pub block_root: Hash256, pub seen_timestamp: Duration, pub peer_group: PeerGroup, } +impl DownloadResult { + pub fn new(value: T, peer_group: PeerGroup, seen_timestamp: Duration) -> Self { + Self { + value, + seen_timestamp, + peer_group, + } + } +} + #[derive(IntoStaticStr)] pub enum State { AwaitingDownload(/* reason */ &'static str), @@ -410,7 +426,7 @@ pub enum State { /// Request is processing, sent by lookup sync Processing(DownloadResult), /// Request is processed - Processed(/* reason */ &'static str), + Processed(/* reason */ &'static str, T), } /// Object representing the state of a single block or blob lookup request. @@ -477,10 +493,29 @@ impl SingleLookupRequestState { State::Downloading { .. } => None, State::AwaitingProcess(result) => Some(&result.value), State::Processing(result) => Some(&result.value), - State::Processed { .. } => None, + State::Processed(_, value) => Some(value), } } + /// Drive download: check max attempts, issue request, handle result. + fn maybe_start_downloading( + &mut self, + request_fn: impl FnOnce() -> Result, RpcRequestSendError>, + ) -> Result<(), LookupRequestError> { + if self.is_awaiting_download() { + match request_fn().map_err(LookupRequestError::SendFailedNetwork)? { + LookupRequestResult::RequestSent(req_id) => self.on_download_start(req_id)?, + LookupRequestResult::NoRequestNeeded(reason, value) => { + self.on_completed_request(reason, value)? + } + LookupRequestResult::Pending(reason) => { + self.update_awaiting_download_status(reason) + } + } + } + Ok(()) + } + /// Switch to `AwaitingProcessing` if the request is in `AwaitingDownload` state, otherwise /// ignore. pub fn insert_verified_response(&mut self, result: DownloadResult) -> bool { @@ -513,6 +548,17 @@ impl SingleLookupRequestState { } } + pub fn on_download_response( + &mut self, + req_id: ReqId, + result: Result, RpcResponseError>, + ) -> Result<(), LookupRequestError> { + match result { + Ok(result) => self.on_download_success(req_id, result), + Err(_) => self.on_download_failure(req_id), + } + } + /// Registers a failure in downloading a block. This might be a peer disconnection or a wrong /// block. pub fn on_download_failure(&mut self, req_id: ReqId) -> Result<(), LookupRequestError> { @@ -525,6 +571,10 @@ impl SingleLookupRequestState { }); } self.failed_downloading = self.failed_downloading.saturating_add(1); + if self.failed_downloading >= SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS { + return Err(LookupRequestError::TooManyAttempts); + } + self.state = State::AwaitingDownload("not started"); Ok(()) } @@ -589,6 +639,9 @@ impl SingleLookupRequestState { State::Processing(result) => { let peers_source = result.peer_group.clone(); self.failed_processing = self.failed_processing.saturating_add(1); + if self.failed_processing >= SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS { + return Err(LookupRequestError::TooManyAttempts); + } self.state = State::AwaitingDownload("not started"); Ok(peers_source) } @@ -600,8 +653,8 @@ impl SingleLookupRequestState { pub fn on_processing_success(&mut self) -> Result<(), LookupRequestError> { match &self.state { - State::Processing(_) => { - self.state = State::Processed("processing success"); + State::Processing(data) => { + self.state = State::Processed("processing success", data.value.clone()); Ok(()) } other => Err(LookupRequestError::BadState(format!( @@ -611,10 +664,14 @@ impl SingleLookupRequestState { } /// Mark a request as complete without any download or processing - pub fn on_completed_request(&mut self, reason: &'static str) -> Result<(), LookupRequestError> { + pub fn on_completed_request( + &mut self, + reason: &'static str, + value: T, + ) -> Result<(), LookupRequestError> { match &self.state { State::AwaitingDownload { .. } => { - self.state = State::Processed(reason); + self.state = State::Processed(reason, value); Ok(()) } other => Err(LookupRequestError::BadState(format!( @@ -622,15 +679,6 @@ impl SingleLookupRequestState { ))), } } - - /// The total number of failures, whether it be processing or downloading. - pub fn failed_attempts(&self) -> u8 { - self.failed_processing + self.failed_downloading - } - - pub fn more_failed_processing_attempts(&self) -> bool { - self.failed_processing >= self.failed_downloading - } } // Display is used in the BadState assertions above @@ -647,15 +695,15 @@ impl std::fmt::Debug for State { match self { Self::AwaitingDownload(reason) => write!(f, "AwaitingDownload({})", reason), Self::Downloading(req_id) => write!(f, "Downloading({:?})", req_id), - Self::AwaitingProcess(d) => write!(f, "AwaitingProcess({:?})", d.peer_group), - Self::Processing(d) => write!(f, "Processing({:?})", d.peer_group), - Self::Processed(reason) => write!(f, "Processed({})", reason), + Self::AwaitingProcess(_) => write!(f, "AwaitingProcess"), + Self::Processing(_) => write!(f, "Processing"), + Self::Processed(reason, _) => write!(f, "Processed({})", reason), } } } fn fmt_peer_set_as_len( - peer_set: &Arc>>, + peer_set: &PeerSet, f: &mut std::fmt::Formatter, ) -> Result<(), std::fmt::Error> { write!(f, "{}", peer_set.read().len()) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 37d13c2eae..166c65b6e1 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -45,9 +45,7 @@ use crate::network_beacon_processor::{ }; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; -use crate::sync::block_lookups::{ - BlockComponent, BlockRequestState, CustodyRequestState, DownloadResult, -}; +use crate::sync::block_lookups::{BlockComponent, DownloadResult}; use crate::sync::custody_backfill_sync::CustodyBackFillSync; use crate::sync::network_context::{PeerGroup, RpcResponseResult}; use beacon_chain::block_verification_types::AsBlock; @@ -867,7 +865,6 @@ impl SyncManager { block_slot, BlockComponent::Block(DownloadResult { value: block.block_cloned(), - block_root, seen_timestamp: self.chain.slot_clock.now_duration().unwrap_or_default(), peer_group: PeerGroup::from_single(peer_id), }), @@ -885,7 +882,7 @@ impl SyncManager { block_root, parent_root, slot, - BlockComponent::Sidecar { parent_root }, + BlockComponent::Sidecar, ); } SyncMessage::UnknownBlockHashFromAttestation(peer_id, block_root) => { @@ -975,6 +972,7 @@ impl SyncManager { if self.block_lookups.search_child_and_parent( block_root, block_component, + parent_root, peer_id, &mut self.network, ) { @@ -1125,14 +1123,13 @@ impl SyncManager { block: RpcEvent>>, ) { if let Some(resp) = self.network.on_single_block_response(id, peer_id, block) { - self.block_lookups - .on_download_response::>( - id, - resp.map(|(value, seen_timestamp)| { - (value, PeerGroup::from_single(peer_id), seen_timestamp) - }), - &mut self.network, - ) + self.block_lookups.on_block_download_response( + id, + resp.map(|(value, seen_timestamp)| { + DownloadResult::new(value, PeerGroup::from_single(peer_id), seen_timestamp) + }), + &mut self.network, + ) } } @@ -1308,11 +1305,7 @@ impl SyncManager { response: CustodyByRootResult, ) { self.block_lookups - .on_download_response::>( - requester.0, - response, - &mut self.network, - ); + .on_custody_download_response(requester.0, response, &mut self.network); } /// Handles receiving a response for a range sync request that should have both blocks and diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 95ae84755c..1e35c0a72f 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -16,7 +16,7 @@ use crate::network_beacon_processor::TestBeaconChainType; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::batch::ByRangeRequestType; -use crate::sync::block_lookups::SingleLookupId; +use crate::sync::block_lookups::{DownloadResult, SingleLookupId}; use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::range_data_column_batch_request::RangeDataColumnBatchRequest; use beacon_chain::block_verification_types::LookupBlock; @@ -95,7 +95,7 @@ pub type RpcResponseResult = Result<(T, Duration), RpcResponseError>; /// Duration = latest seen timestamp of all received data columns pub type CustodyByRootResult = - Result<(DataColumnSidecarList, PeerGroup, Duration), RpcResponseError>; + Result>, RpcResponseError>; #[derive(Debug)] pub enum RpcResponseError { @@ -176,13 +176,13 @@ impl PeerGroup { /// Sequential ID that uniquely identifies ReqResp outgoing requests pub type ReqId = u32; -pub enum LookupRequestResult { +pub enum LookupRequestResult { /// A request is sent. Sync MUST receive an event from the network in the future for either: /// completed response or failed request RequestSent(I), /// No request is sent, and no further action is necessary to consider this request completed. /// Includes a reason why this request is not needed. - NoRequestNeeded(&'static str), + NoRequestNeeded(&'static str, T), /// No request is sent, but the request is not completed. Sync MUST receive some future event /// that makes progress on the request. For example: request is processing from a different /// source (i.e. block received from gossip) and sync MUST receive an event with that processing @@ -820,7 +820,7 @@ impl SyncNetworkContext { lookup_id: SingleLookupId, lookup_peers: Arc>>, block_root: Hash256, - ) -> Result { + ) -> Result>>, RpcRequestSendError> { let active_request_count_by_peer = self.active_request_count_by_peer(); let Some(peer_id) = lookup_peers .read() @@ -871,9 +871,10 @@ impl SyncNetworkContext { }, // Block is fully validated. If it's not yet imported it's waiting for missing block // components. Consider this request completed and do nothing. - BlockProcessStatus::ExecutionValidated { .. } => { + BlockProcessStatus::ExecutionValidated(block) => { return Ok(LookupRequestResult::NoRequestNeeded( "block execution validated", + block, )); } } @@ -937,12 +938,13 @@ impl SyncNetworkContext { lookup_id: SingleLookupId, lookup_peers: Arc>>, block_root: Hash256, - ) -> Result { + ) -> Result, RpcRequestSendError> { // Skip the download if fork-choice already saw this envelope (e.g. imported via gossip // before the lookup got here). if self.chain.envelope_is_known_to_fork_choice(&block_root) { return Ok(LookupRequestResult::NoRequestNeeded( "envelope already known to fork-choice", + (), )); } @@ -1011,7 +1013,7 @@ impl SyncNetworkContext { peer_id: PeerId, request: DataColumnsByRootSingleBlockRequest, expect_max_responses: bool, - ) -> Result, &'static str> { + ) -> Result, &'static str> { let id = DataColumnsByRootRequestId { id: self.next_id(), requester, @@ -1060,7 +1062,7 @@ impl SyncNetworkContext { block_root: Hash256, block_slot: Slot, lookup_peers: Arc>>, - ) -> Result { + ) -> Result>, RpcRequestSendError> { let custody_indexes_imported = self .chain .cached_data_column_indexes(&block_root, block_slot) @@ -1078,7 +1080,10 @@ impl SyncNetworkContext { if custody_indexes_to_fetch.is_empty() { // No indexes required, do not issue any request - return Ok(LookupRequestResult::NoRequestNeeded("no indices to fetch")); + return Ok(LookupRequestResult::NoRequestNeeded( + "no indices to fetch", + vec![], + )); } let id = SingleLookupReqId { @@ -1528,8 +1533,8 @@ impl SyncNetworkContext { // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to // an Option first to use in an `if let Some() { act on result }` block. match result.as_ref() { - Some(Ok((columns, peer_group, _))) => { - debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing") + Some(Ok(data)) => { + debug!(?id, count = data.value.len(), peers = ?data.peer_group, "Custody request success, removing") } Some(Err(e)) => { debug!(?id, error = ?e, "Custody request failure, removing" ) diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index 2b96800e37..e74b74ec08 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -1,3 +1,4 @@ +use crate::sync::block_lookups::DownloadResult; use crate::sync::network_context::{ DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, }; @@ -56,8 +57,7 @@ struct ActiveBatchColumnsRequest { span: Span, } -pub type CustodyRequestResult = - Result, PeerGroup, Duration)>, Error>; +pub type CustodyRequestResult = Result>>, Error>; impl ActiveCustodyRequest { pub(crate) fn new( @@ -227,7 +227,11 @@ impl ActiveCustodyRequest { .into_iter() .max() .unwrap_or_else(|| cx.chain.slot_clock.now_duration().unwrap_or_default()); - return Ok(Some((columns, peer_group, max_seen_timestamp))); + return Ok(Some(DownloadResult::new( + columns, + peer_group, + max_seen_timestamp, + ))); } let active_request_count_by_peer = cx.active_request_count_by_peer(); @@ -343,7 +347,7 @@ impl ActiveCustodyRequest { }, ); } - LookupRequestResult::NoRequestNeeded(_) => unreachable!(), + LookupRequestResult::NoRequestNeeded(..) => unreachable!(), LookupRequestResult::Pending(_) => unreachable!(), } } From d617c826fe6c4983cd883d50b7f4df5bce31304f Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Wed, 3 Jun 2026 09:07:41 -0700 Subject: [PATCH 4/4] Gloas data column reprocess queue (#9339) When debugging ePBS with columns, we noticed that columns arriving before their block dont pass gossip verification checks and are dropped. This PR ensures that columns arriving before the block are sent to the reprocess queue. Once their block arrives, they are reprocessed. This isn't an issue pre-gloas because we don't make block root checks for fulu data columns. This allows us to gossip verify the column and send it to the DA cache before the block arrives. I think we also need to handle this edge case for partial data columns. Theres an existing TODO for that already. Co-Authored-By: Eitan Seri-Levi --- beacon_node/beacon_processor/src/lib.rs | 23 +- .../src/scheduler/work_queue.rs | 7 + .../src/scheduler/work_reprocessing_queue.rs | 216 +++++++++++++++--- .../gossip_methods.rs | 46 +++- .../src/network_beacon_processor/mod.rs | 2 + .../src/network_beacon_processor/tests.rs | 1 + beacon_node/network/src/router.rs | 1 + 7 files changed, 248 insertions(+), 48 deletions(-) diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index af3ff09c8a..d6233ebaf9 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -41,8 +41,8 @@ pub use crate::scheduler::BeaconProcessorQueueLengths; use crate::scheduler::work_queue::WorkQueues; use crate::work_reprocessing_queue::{ - QueuedBackfillBatch, QueuedColumnReconstruction, QueuedGossipBlock, QueuedGossipEnvelope, - ReprocessQueueMessage, + QueuedBackfillBatch, QueuedColumnReconstruction, QueuedGossipBlock, QueuedGossipDataColumn, + QueuedGossipEnvelope, ReprocessQueueMessage, }; use futures::stream::{Stream, StreamExt}; use futures::task::Poll; @@ -304,6 +304,10 @@ impl From for WorkEvent { work: Work::ColumnReconstruction(process_fn), } } + ReadyWork::DataColumn(QueuedGossipDataColumn { process_fn, .. }) => Self { + drop_during_sync: true, + work: Work::UnknownBlockDataColumn { process_fn }, + }, } } } @@ -369,6 +373,9 @@ pub enum Work { UnknownBlockAttestation { process_fn: BlockingFn, }, + UnknownBlockDataColumn { + process_fn: BlockingFn, + }, GossipAttestationBatch { attestations: GossipAttestationBatch, process_batch: Box, @@ -464,6 +471,7 @@ pub enum WorkType { GossipAttestation, GossipAttestationToConvert, UnknownBlockAttestation, + UnknownBlockDataColumn, GossipAttestationBatch, GossipAggregate, UnknownBlockAggregate, @@ -569,6 +577,7 @@ impl Work { Work::LightClientFinalityUpdateRequest(_) => WorkType::LightClientFinalityUpdateRequest, Work::LightClientUpdatesByRangeRequest(_) => WorkType::LightClientUpdatesByRangeRequest, Work::UnknownBlockAttestation { .. } => WorkType::UnknownBlockAttestation, + Work::UnknownBlockDataColumn { .. } => WorkType::UnknownBlockDataColumn, Work::UnknownBlockAggregate { .. } => WorkType::UnknownBlockAggregate, Work::UnknownLightClientOptimisticUpdate { .. } => { WorkType::UnknownLightClientOptimisticUpdate @@ -842,6 +851,9 @@ impl BeaconProcessor { Some(item) } else if let Some(item) = work_queues.gossip_data_column_queue.pop() { Some(item) + } else if let Some(item) = work_queues.unknown_block_data_column_queue.pop() + { + Some(item) } else if let Some(item) = work_queues.gossip_partial_data_column_queue.pop() { @@ -1238,6 +1250,9 @@ impl BeaconProcessor { Work::UnknownBlockAttestation { .. } => { work_queues.unknown_block_attestation_queue.push(work) } + Work::UnknownBlockDataColumn { .. } => work_queues + .unknown_block_data_column_queue + .push(work, work_id), Work::UnknownBlockAggregate { .. } => { work_queues.unknown_block_aggregate_queue.push(work) } @@ -1288,6 +1303,9 @@ impl BeaconProcessor { WorkType::UnknownBlockAttestation => { work_queues.unknown_block_attestation_queue.len() } + WorkType::UnknownBlockDataColumn => { + work_queues.unknown_block_data_column_queue.len() + } WorkType::GossipAttestationBatch => 0, // No queue WorkType::GossipAggregate => work_queues.aggregate_queue.len(), WorkType::UnknownBlockAggregate => { @@ -1504,6 +1522,7 @@ impl BeaconProcessor { }), Work::UnknownBlockAttestation { process_fn } | Work::UnknownBlockAggregate { process_fn } + | Work::UnknownBlockDataColumn { process_fn } | Work::UnknownLightClientOptimisticUpdate { process_fn, .. } => { task_spawner.spawn_blocking(process_fn) } diff --git a/beacon_node/beacon_processor/src/scheduler/work_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_queue.rs index ebd66e743d..cc03feac51 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_queue.rs @@ -111,6 +111,7 @@ pub struct BeaconProcessorQueueLengths { attestation_queue: usize, unknown_block_aggregate_queue: usize, unknown_block_attestation_queue: usize, + unknown_block_data_column_queue: usize, sync_message_queue: usize, sync_contribution_queue: usize, gossip_voluntary_exit_queue: usize, @@ -174,6 +175,8 @@ impl BeaconProcessorQueueLengths { Ok(Self { aggregate_queue: 4096, unknown_block_aggregate_queue: 1024, + // Capacity for two slot's worth of data columns for a supernode. + unknown_block_data_column_queue: 256, // Capacity for a full slot's worth of attestations if subscribed to all subnets attestation_queue: std::cmp::max( active_validator_count / slots_per_epoch, @@ -245,6 +248,7 @@ pub struct WorkQueues { pub attestation_debounce: TimeLatch, pub unknown_block_aggregate_queue: LifoQueue>, pub unknown_block_attestation_queue: LifoQueue>, + pub unknown_block_data_column_queue: FifoQueue>, pub sync_message_queue: LifoQueue>, pub sync_contribution_queue: LifoQueue>, pub gossip_voluntary_exit_queue: FifoQueue>, @@ -302,6 +306,8 @@ impl WorkQueues { LifoQueue::new(queue_lengths.unknown_block_aggregate_queue); let unknown_block_attestation_queue = LifoQueue::new(queue_lengths.unknown_block_attestation_queue); + let unknown_block_data_column_queue = + FifoQueue::new(queue_lengths.unknown_block_data_column_queue); let sync_message_queue = LifoQueue::new(queue_lengths.sync_message_queue); let sync_contribution_queue = LifoQueue::new(queue_lengths.sync_contribution_queue); @@ -383,6 +389,7 @@ impl WorkQueues { attestation_debounce, unknown_block_aggregate_queue, unknown_block_attestation_queue, + unknown_block_data_column_queue, sync_message_queue, sync_contribution_queue, gossip_voluntary_exit_queue, diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs index b1fa56af01..62ed86fbad 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs @@ -52,6 +52,10 @@ pub const QUEUED_ATTESTATION_DELAY: Duration = Duration::from_secs(12); /// For how long to queue light client updates for re-processing. pub const QUEUED_LIGHT_CLIENT_UPDATE_DELAY: Duration = Duration::from_secs(12); +/// Data column timeout as a multiplier of slot duration. Columns waiting for their block will be +/// sent for processing after this many slots worth of time, even if the block hasn't arrived. +const QUEUED_DATA_COLUMN_DELAY_SLOTS: u32 = 1; + /// Envelope timeout as a multiplier of slot duration. Envelopes waiting for their block will be /// sent for processing after this many slots worth of time, even if the block hasn't arrived. const QUEUED_ENVELOPE_DELAY_SLOTS: u32 = 1; @@ -76,6 +80,9 @@ const MAXIMUM_QUEUED_ENVELOPES: usize = 16; /// How many attestations we keep before new ones get dropped. const MAXIMUM_QUEUED_ATTESTATIONS: usize = 16_384; +/// How many columns we keep before new ones get dropped. +const MAXIMUM_QUEUED_DATA_COLUMNS: usize = 256; + /// How many light client updates we keep before new ones get dropped. const MAXIMUM_QUEUED_LIGHT_CLIENT_UPDATES: usize = 128; @@ -123,6 +130,8 @@ pub enum ReprocessQueueMessage { UnknownLightClientOptimisticUpdate(QueuedLightClientUpdate), /// A new backfill batch that needs to be scheduled for processing. BackfillSync(QueuedBackfillBatch), + /// A gossip data column that references an unknown block. + UnknownBlockDataColumn(QueuedGossipDataColumn), /// A delayed column reconstruction that needs checking DelayColumnReconstruction(QueuedColumnReconstruction), } @@ -138,6 +147,7 @@ pub enum ReadyWork { LightClientUpdate(QueuedLightClientUpdate), BackfillSync(QueuedBackfillBatch), ColumnReconstruction(QueuedColumnReconstruction), + DataColumn(QueuedGossipDataColumn), } /// An Attestation for which the corresponding block was not seen while processing, queued for @@ -200,6 +210,12 @@ pub struct QueuedColumnReconstruction { pub process_fn: AsyncFn, } +/// A gossip data column that references an unknown block, queued for later reprocessing. +pub struct QueuedGossipDataColumn { + pub beacon_block_root: Hash256, + pub process_fn: BlockingFn, +} + impl TryFrom> for QueuedBackfillBatch { type Error = WorkEvent; @@ -240,6 +256,8 @@ enum InboundEvent { ReadyBackfillSync(QueuedBackfillBatch), /// A column reconstruction that was queued is ready for processing. ReadyColumnReconstruction(QueuedColumnReconstruction), + /// A gossip data column that is ready for re-processing. + ReadyDataColumn(Hash256), /// A message sent to the `ReprocessQueue` Msg(ReprocessQueueMessage), } @@ -264,6 +282,8 @@ struct ReprocessQueue { lc_updates_delay_queue: DelayQueue, /// Queue to manage scheduled column reconstructions. column_reconstructions_delay_queue: DelayQueue, + /// Queue to manage gossip data column timeouts. + data_columns_delay_queue: DelayQueue, /* Queued items */ /// Queued blocks. @@ -284,6 +304,10 @@ struct ReprocessQueue { queued_column_reconstructions: HashMap>, /// Queued backfill batches queued_backfill_batches: Vec, + /// Queued gossip data columns awaiting their block, keyed by block root. + awaiting_data_columns_per_root: HashMap, DelayKey)>, + /// Total number of queued gossip data columns across all roots. + queued_data_columns_count: usize, /* Aux */ /// Next attestation id, used for both aggregated and unaggregated attestations @@ -294,6 +318,7 @@ struct ReprocessQueue { rpc_block_debounce: TimeLatch, attestation_delay_debounce: TimeLatch, lc_update_delay_debounce: TimeLatch, + data_column_delay_debounce: TimeLatch, next_backfill_batch_event: Option>>, slot_clock: Arc, } @@ -387,6 +412,13 @@ impl Stream for ReprocessQueue { Poll::Ready(None) | Poll::Pending => (), } + match self.data_columns_delay_queue.poll_expired(cx) { + Poll::Ready(Some(block_root)) => { + return Poll::Ready(Some(InboundEvent::ReadyDataColumn(block_root.into_inner()))); + } + Poll::Ready(None) | Poll::Pending => (), + } + if let Some(next_backfill_batch_event) = self.next_backfill_batch_event.as_mut() { match next_backfill_batch_event.as_mut().poll(cx) { Poll::Ready(_) => { @@ -455,6 +487,7 @@ impl ReprocessQueue { attestations_delay_queue: DelayQueue::new(), lc_updates_delay_queue: DelayQueue::new(), column_reconstructions_delay_queue: DelayQueue::new(), + data_columns_delay_queue: DelayQueue::new(), queued_gossip_block_roots: HashSet::new(), awaiting_envelopes_per_root: HashMap::new(), queued_lc_updates: FnvHashMap::default(), @@ -464,6 +497,8 @@ impl ReprocessQueue { awaiting_lc_updates_per_parent_root: HashMap::new(), queued_backfill_batches: Vec::new(), queued_column_reconstructions: HashMap::new(), + awaiting_data_columns_per_root: HashMap::new(), + queued_data_columns_count: 0, next_attestation: 0, next_lc_update: 0, early_block_debounce: TimeLatch::default(), @@ -471,6 +506,7 @@ impl ReprocessQueue { rpc_block_debounce: TimeLatch::default(), attestation_delay_debounce: TimeLatch::default(), lc_update_delay_debounce: TimeLatch::default(), + data_column_delay_debounce: TimeLatch::default(), next_backfill_batch_event: None, slot_clock, } @@ -551,22 +587,16 @@ impl ReprocessQueue { return; } - // When the queue is full, evict the oldest entry to make room for newer envelopes. + // When the queue is full, drop the new envelope. if self.awaiting_envelopes_per_root.len() >= MAXIMUM_QUEUED_ENVELOPES { if self.envelope_delay_debounce.elapsed() { warn!( queue_size = MAXIMUM_QUEUED_ENVELOPES, msg = "system resources may be saturated", - "Envelope delay queue is full, evicting oldest entry" + "Envelope delay queue is full, dropping envelope" ); } - if let Some(oldest_root) = - self.awaiting_envelopes_per_root.keys().next().copied() - && let Some((_envelope, delay_key)) = - self.awaiting_envelopes_per_root.remove(&oldest_root) - { - self.envelope_delay_queue.remove(&delay_key); - } + return; } // Register the timeout. @@ -688,6 +718,37 @@ impl ReprocessQueue { self.next_attestation += 1; } + InboundEvent::Msg(UnknownBlockDataColumn(queued_data_column)) => { + let block_root = queued_data_column.beacon_block_root; + + if self.queued_data_columns_count >= MAXIMUM_QUEUED_DATA_COLUMNS { + if self.data_column_delay_debounce.elapsed() { + warn!( + queue_size = MAXIMUM_QUEUED_DATA_COLUMNS, + msg = "system resources may be saturated", + "Data column delay queue is full, dropping column" + ); + } + return; + } + + if let Some((columns, _delay_key)) = + self.awaiting_data_columns_per_root.get_mut(&block_root) + { + // Append to existing entry; the timer for this root is already running. + columns.push(queued_data_column); + } else { + let delay_key = self.data_columns_delay_queue.insert( + block_root, + self.slot_clock.slot_duration() * QUEUED_DATA_COLUMN_DELAY_SLOTS, + ); + + self.awaiting_data_columns_per_root + .insert(block_root, (vec![queued_data_column], delay_key)); + } + + self.queued_data_columns_count += 1; + } InboundEvent::Msg(UnknownLightClientOptimisticUpdate( queued_light_client_optimistic_update, )) => { @@ -800,6 +861,25 @@ impl ReprocessQueue { ); } } + + // Unqueue the data columns we have for this root, if any. + if let Some((data_columns, delay_key)) = + self.awaiting_data_columns_per_root.remove(&block_root) + { + self.data_columns_delay_queue.remove(&delay_key); + self.queued_data_columns_count = self + .queued_data_columns_count + .saturating_sub(data_columns.len()); + for data_column in data_columns { + if self + .ready_work_tx + .try_send(ReadyWork::DataColumn(data_column)) + .is_err() + { + error!(?block_root, "Failed to send data column for reprocessing"); + } + } + } } InboundEvent::Msg(NewLightClientOptimisticUpdate { parent_root }) => { // Unqueue the light client optimistic updates we have for this root, if any. @@ -1053,6 +1133,27 @@ impl ReprocessQueue { ); } } + InboundEvent::ReadyDataColumn(block_root) => { + if let Some((data_columns, _)) = + self.awaiting_data_columns_per_root.remove(&block_root) + { + self.queued_data_columns_count = self + .queued_data_columns_count + .saturating_sub(data_columns.len()); + for data_column in data_columns { + if self + .ready_work_tx + .try_send(ReadyWork::DataColumn(data_column)) + .is_err() + { + error!( + hint = "system may be overloaded", + "Ignored expired gossip data column" + ); + } + } + } + } } metrics::set_gauge_vec( @@ -1581,48 +1682,87 @@ mod tests { assert_eq!(queue.envelope_delay_queue.len(), 1); } + /// Tests that a queued gossip data column is released when its block is imported. #[tokio::test] - async fn envelope_capacity_evicts_oldest() { + async fn data_column_released_on_block_imported() { + create_test_tracing_subscriber(); + + let config = BeaconProcessorConfig::default(); + let (ready_work_tx, mut ready_work_rx) = + mpsc::channel::(config.max_scheduled_work_queue_len); + let (_, reprocess_work_rx) = + mpsc::channel::(config.max_scheduled_work_queue_len); + let slot_clock = Arc::new(testing_slot_clock(12)); + let mut queue = ReprocessQueue::new(ready_work_tx, reprocess_work_rx, slot_clock); + + tokio::time::pause(); + + let beacon_block_root = Hash256::repeat_byte(0xbb); + + let msg = ReprocessQueueMessage::UnknownBlockDataColumn(QueuedGossipDataColumn { + beacon_block_root, + process_fn: Box::new(|| {}), + }); + queue.handle_message(InboundEvent::Msg(msg)); + + assert_eq!(queue.awaiting_data_columns_per_root.len(), 1); + assert!( + queue + .awaiting_data_columns_per_root + .contains_key(&beacon_block_root) + ); + assert_eq!(queue.data_columns_delay_queue.len(), 1); + + // Simulate block import. + queue.handle_message(InboundEvent::Msg(ReprocessQueueMessage::BlockImported { + block_root: beacon_block_root, + parent_root: Hash256::repeat_byte(0x00), + })); + + // Internal state should be cleaned up. + assert!(queue.awaiting_data_columns_per_root.is_empty()); + assert_eq!(queue.data_columns_delay_queue.len(), 0); + + // The column should have been sent to the ready_work channel. + let ready = ready_work_rx.try_recv().expect("column should be ready"); + assert!(matches!(ready, ReadyWork::DataColumn(_))); + } + + /// Tests that an expired gossip data column is pruned cleanly from all internal state. + #[tokio::test] + async fn prune_awaiting_data_columns_per_root() { create_test_tracing_subscriber(); let mut queue = test_queue(); - // Pause time so it only advances manually tokio::time::pause(); - // Fill the queue to capacity. - for i in 0..MAXIMUM_QUEUED_ENVELOPES { - let block_root = Hash256::repeat_byte(i as u8); - let msg = ReprocessQueueMessage::UnknownBlockForEnvelope(QueuedGossipEnvelope { - beacon_block_slot: Slot::new(1), - beacon_block_root: block_root, - process_fn: Box::pin(async {}), - }); - queue.handle_message(InboundEvent::Msg(msg)); - } - assert_eq!( - queue.awaiting_envelopes_per_root.len(), - MAXIMUM_QUEUED_ENVELOPES - ); + let beacon_block_root = Hash256::repeat_byte(0xcd); - // One more should evict the oldest and insert the new one. - let overflow_root = Hash256::repeat_byte(0xff); - let msg = ReprocessQueueMessage::UnknownBlockForEnvelope(QueuedGossipEnvelope { - beacon_block_slot: Slot::new(1), - beacon_block_root: overflow_root, - process_fn: Box::pin(async {}), + let msg = ReprocessQueueMessage::UnknownBlockDataColumn(QueuedGossipDataColumn { + beacon_block_root, + process_fn: Box::new(|| {}), }); queue.handle_message(InboundEvent::Msg(msg)); - // Queue should still be at capacity, with the new root present. - assert_eq!( - queue.awaiting_envelopes_per_root.len(), - MAXIMUM_QUEUED_ENVELOPES - ); + assert_eq!(queue.awaiting_data_columns_per_root.len(), 1); assert!( queue - .awaiting_envelopes_per_root - .contains_key(&overflow_root) + .awaiting_data_columns_per_root + .contains_key(&beacon_block_root) ); + + // Advance time past the delay so the entry expires. + advance_time( + &queue.slot_clock, + 2 * queue.slot_clock.slot_duration() * QUEUED_DATA_COLUMN_DELAY_SLOTS, + ) + .await; + let ready_msg = queue.next().await.unwrap(); + assert!(matches!(ready_msg, InboundEvent::ReadyDataColumn(_))); + queue.handle_message(ready_msg); + + // All internal state should be cleaned up. + assert!(queue.awaiting_data_columns_per_root.is_empty()); } } diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index df94b473a8..9becfd4d59 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -61,8 +61,8 @@ use beacon_processor::work_reprocessing_queue::QueuedColumnReconstruction; use beacon_processor::{ DuplicateCache, GossipAggregatePackage, GossipAttestationBatch, work_reprocessing_queue::{ - QueuedAggregate, QueuedGossipBlock, QueuedGossipEnvelope, QueuedLightClientUpdate, - QueuedUnaggregate, ReprocessQueueMessage, + QueuedAggregate, QueuedGossipBlock, QueuedGossipDataColumn, QueuedGossipEnvelope, + QueuedLightClientUpdate, QueuedUnaggregate, ReprocessQueueMessage, }, }; @@ -657,6 +657,7 @@ impl NetworkBeaconProcessor { subnet_id: DataColumnSubnetId, column_sidecar: Arc>, seen_duration: Duration, + allow_reprocess: bool, ) { let slot = column_sidecar.slot(); let block_root = column_sidecar.block_root(); @@ -738,19 +739,48 @@ impl NetworkBeaconProcessor { .. } => { debug!( - action = "ignoring", + action = "queuing for reprocessing", %unknown_block_root, "Unknown block root for column" ); - // TODO(gloas): wire this into proper lookup sync. Sending - // `UnknownBlockHashFromAttestation` here is a Fulu-shaped fallback that - // mixes column processing with the attestation lookup path and is not - // the right primitive for Gloas column lookups. self.propagate_validation_result( - message_id, + message_id.clone(), peer_id, MessageAcceptance::Ignore, ); + + if allow_reprocess { + // Queue the column for reprocessing when the block arrives. + let processor = self.clone(); + let reprocess_msg = ReprocessQueueMessage::UnknownBlockDataColumn( + QueuedGossipDataColumn { + beacon_block_root: unknown_block_root, + process_fn: Box::new(move || { + let _ = processor.send_gossip_data_column_sidecar( + message_id, + peer_id, + subnet_id, + column_sidecar, + seen_duration, + false, // Do not reprocess this message again. + ); + }), + }, + ); + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(reprocess_msg), + }) + .is_err() + { + debug!( + %unknown_block_root, + "Failed to queue data column for reprocessing" + ); + } + } } GossipDataColumnError::InvalidVariant | GossipDataColumnError::PubkeyCacheTimeout diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index c2c8577046..f3c773eb25 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -201,6 +201,7 @@ impl NetworkBeaconProcessor { subnet_id: DataColumnSubnetId, column_sidecar: Arc>, seen_timestamp: Duration, + allow_reprocess: bool, ) -> Result<(), Error> { let processor = self.clone(); let process_fn = async move { @@ -211,6 +212,7 @@ impl NetworkBeaconProcessor { subnet_id, column_sidecar, seen_timestamp, + allow_reprocess, ) .await }; diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index c0b093e254..ad98851532 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -412,6 +412,7 @@ impl TestRig { DataColumnSubnetId::from_column_index(*data_column.index(), &self.chain.spec), data_column.clone(), Duration::from_secs(0), + true, ) .unwrap(); } diff --git a/beacon_node/network/src/router.rs b/beacon_node/network/src/router.rs index a8e5c9ae4a..277ece0aa8 100644 --- a/beacon_node/network/src/router.rs +++ b/beacon_node/network/src/router.rs @@ -422,6 +422,7 @@ impl Router { subnet_id, column_sidecar, seen_timestamp, + true, ), ) }