diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index af3ff09c8a..d6233ebaf9 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -41,8 +41,8 @@ pub use crate::scheduler::BeaconProcessorQueueLengths; use crate::scheduler::work_queue::WorkQueues; use crate::work_reprocessing_queue::{ - QueuedBackfillBatch, QueuedColumnReconstruction, QueuedGossipBlock, QueuedGossipEnvelope, - ReprocessQueueMessage, + QueuedBackfillBatch, QueuedColumnReconstruction, QueuedGossipBlock, QueuedGossipDataColumn, + QueuedGossipEnvelope, ReprocessQueueMessage, }; use futures::stream::{Stream, StreamExt}; use futures::task::Poll; @@ -304,6 +304,10 @@ impl From for WorkEvent { work: Work::ColumnReconstruction(process_fn), } } + ReadyWork::DataColumn(QueuedGossipDataColumn { process_fn, .. }) => Self { + drop_during_sync: true, + work: Work::UnknownBlockDataColumn { process_fn }, + }, } } } @@ -369,6 +373,9 @@ pub enum Work { UnknownBlockAttestation { process_fn: BlockingFn, }, + UnknownBlockDataColumn { + process_fn: BlockingFn, + }, GossipAttestationBatch { attestations: GossipAttestationBatch, process_batch: Box, @@ -464,6 +471,7 @@ pub enum WorkType { GossipAttestation, GossipAttestationToConvert, UnknownBlockAttestation, + UnknownBlockDataColumn, GossipAttestationBatch, GossipAggregate, UnknownBlockAggregate, @@ -569,6 +577,7 @@ impl Work { Work::LightClientFinalityUpdateRequest(_) => WorkType::LightClientFinalityUpdateRequest, Work::LightClientUpdatesByRangeRequest(_) => WorkType::LightClientUpdatesByRangeRequest, Work::UnknownBlockAttestation { .. } => WorkType::UnknownBlockAttestation, + Work::UnknownBlockDataColumn { .. } => WorkType::UnknownBlockDataColumn, Work::UnknownBlockAggregate { .. } => WorkType::UnknownBlockAggregate, Work::UnknownLightClientOptimisticUpdate { .. 
} => { WorkType::UnknownLightClientOptimisticUpdate @@ -842,6 +851,9 @@ impl BeaconProcessor { Some(item) } else if let Some(item) = work_queues.gossip_data_column_queue.pop() { Some(item) + } else if let Some(item) = work_queues.unknown_block_data_column_queue.pop() + { + Some(item) } else if let Some(item) = work_queues.gossip_partial_data_column_queue.pop() { @@ -1238,6 +1250,9 @@ impl BeaconProcessor { Work::UnknownBlockAttestation { .. } => { work_queues.unknown_block_attestation_queue.push(work) } + Work::UnknownBlockDataColumn { .. } => work_queues + .unknown_block_data_column_queue + .push(work, work_id), Work::UnknownBlockAggregate { .. } => { work_queues.unknown_block_aggregate_queue.push(work) } @@ -1288,6 +1303,9 @@ impl BeaconProcessor { WorkType::UnknownBlockAttestation => { work_queues.unknown_block_attestation_queue.len() } + WorkType::UnknownBlockDataColumn => { + work_queues.unknown_block_data_column_queue.len() + } WorkType::GossipAttestationBatch => 0, // No queue WorkType::GossipAggregate => work_queues.aggregate_queue.len(), WorkType::UnknownBlockAggregate => { @@ -1504,6 +1522,7 @@ impl BeaconProcessor { }), Work::UnknownBlockAttestation { process_fn } | Work::UnknownBlockAggregate { process_fn } + | Work::UnknownBlockDataColumn { process_fn } | Work::UnknownLightClientOptimisticUpdate { process_fn, .. 
} => { task_spawner.spawn_blocking(process_fn) } diff --git a/beacon_node/beacon_processor/src/scheduler/work_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_queue.rs index ebd66e743d..cc03feac51 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_queue.rs @@ -111,6 +111,7 @@ pub struct BeaconProcessorQueueLengths { attestation_queue: usize, unknown_block_aggregate_queue: usize, unknown_block_attestation_queue: usize, + unknown_block_data_column_queue: usize, sync_message_queue: usize, sync_contribution_queue: usize, gossip_voluntary_exit_queue: usize, @@ -174,6 +175,8 @@ impl BeaconProcessorQueueLengths { Ok(Self { aggregate_queue: 4096, unknown_block_aggregate_queue: 1024, + // Capacity for two slots' worth of data columns for a supernode. + unknown_block_data_column_queue: 256, // Capacity for a full slot's worth of attestations if subscribed to all subnets attestation_queue: std::cmp::max( active_validator_count / slots_per_epoch, @@ -245,6 +248,7 @@ pub struct WorkQueues { pub attestation_debounce: TimeLatch, pub unknown_block_aggregate_queue: LifoQueue>, pub unknown_block_attestation_queue: LifoQueue>, + pub unknown_block_data_column_queue: FifoQueue>, pub sync_message_queue: LifoQueue>, pub sync_contribution_queue: LifoQueue>, pub gossip_voluntary_exit_queue: FifoQueue>, @@ -302,6 +306,8 @@ impl WorkQueues { LifoQueue::new(queue_lengths.unknown_block_aggregate_queue); let unknown_block_attestation_queue = LifoQueue::new(queue_lengths.unknown_block_attestation_queue); + let unknown_block_data_column_queue = + FifoQueue::new(queue_lengths.unknown_block_data_column_queue); let sync_message_queue = LifoQueue::new(queue_lengths.sync_message_queue); let sync_contribution_queue = LifoQueue::new(queue_lengths.sync_contribution_queue); @@ -383,6 +389,7 @@ impl WorkQueues { attestation_debounce, unknown_block_aggregate_queue, unknown_block_attestation_queue, + 
unknown_block_data_column_queue, sync_message_queue, sync_contribution_queue, gossip_voluntary_exit_queue, diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs index b1fa56af01..62ed86fbad 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs @@ -52,6 +52,10 @@ pub const QUEUED_ATTESTATION_DELAY: Duration = Duration::from_secs(12); /// For how long to queue light client updates for re-processing. pub const QUEUED_LIGHT_CLIENT_UPDATE_DELAY: Duration = Duration::from_secs(12); +/// Data column timeout as a multiplier of slot duration. Columns waiting for their block will be +/// sent for processing after this many slots worth of time, even if the block hasn't arrived. +const QUEUED_DATA_COLUMN_DELAY_SLOTS: u32 = 1; + /// Envelope timeout as a multiplier of slot duration. Envelopes waiting for their block will be /// sent for processing after this many slots worth of time, even if the block hasn't arrived. const QUEUED_ENVELOPE_DELAY_SLOTS: u32 = 1; @@ -76,6 +80,9 @@ const MAXIMUM_QUEUED_ENVELOPES: usize = 16; /// How many attestations we keep before new ones get dropped. const MAXIMUM_QUEUED_ATTESTATIONS: usize = 16_384; +/// How many columns we keep before new ones get dropped. +const MAXIMUM_QUEUED_DATA_COLUMNS: usize = 256; + /// How many light client updates we keep before new ones get dropped. const MAXIMUM_QUEUED_LIGHT_CLIENT_UPDATES: usize = 128; @@ -123,6 +130,8 @@ pub enum ReprocessQueueMessage { UnknownLightClientOptimisticUpdate(QueuedLightClientUpdate), /// A new backfill batch that needs to be scheduled for processing. BackfillSync(QueuedBackfillBatch), + /// A gossip data column that references an unknown block. 
+ UnknownBlockDataColumn(QueuedGossipDataColumn), /// A delayed column reconstruction that needs checking DelayColumnReconstruction(QueuedColumnReconstruction), } @@ -138,6 +147,7 @@ pub enum ReadyWork { LightClientUpdate(QueuedLightClientUpdate), BackfillSync(QueuedBackfillBatch), ColumnReconstruction(QueuedColumnReconstruction), + DataColumn(QueuedGossipDataColumn), } /// An Attestation for which the corresponding block was not seen while processing, queued for @@ -200,6 +210,12 @@ pub struct QueuedColumnReconstruction { pub process_fn: AsyncFn, } +/// A gossip data column that references an unknown block, queued for later reprocessing. +pub struct QueuedGossipDataColumn { + pub beacon_block_root: Hash256, + pub process_fn: BlockingFn, +} + impl TryFrom> for QueuedBackfillBatch { type Error = WorkEvent; @@ -240,6 +256,8 @@ enum InboundEvent { ReadyBackfillSync(QueuedBackfillBatch), /// A column reconstruction that was queued is ready for processing. ReadyColumnReconstruction(QueuedColumnReconstruction), + /// A gossip data column that is ready for re-processing. + ReadyDataColumn(Hash256), /// A message sent to the `ReprocessQueue` Msg(ReprocessQueueMessage), } @@ -264,6 +282,8 @@ struct ReprocessQueue { lc_updates_delay_queue: DelayQueue, /// Queue to manage scheduled column reconstructions. column_reconstructions_delay_queue: DelayQueue, + /// Queue to manage gossip data column timeouts. + data_columns_delay_queue: DelayQueue, /* Queued items */ /// Queued blocks. @@ -284,6 +304,10 @@ struct ReprocessQueue { queued_column_reconstructions: HashMap>, /// Queued backfill batches queued_backfill_batches: Vec, + /// Queued gossip data columns awaiting their block, keyed by block root. + awaiting_data_columns_per_root: HashMap, DelayKey)>, + /// Total number of queued gossip data columns across all roots. 
+ queued_data_columns_count: usize, /* Aux */ /// Next attestation id, used for both aggregated and unaggregated attestations @@ -294,6 +318,7 @@ struct ReprocessQueue { rpc_block_debounce: TimeLatch, attestation_delay_debounce: TimeLatch, lc_update_delay_debounce: TimeLatch, + data_column_delay_debounce: TimeLatch, next_backfill_batch_event: Option>>, slot_clock: Arc, } @@ -387,6 +412,13 @@ impl Stream for ReprocessQueue { Poll::Ready(None) | Poll::Pending => (), } + match self.data_columns_delay_queue.poll_expired(cx) { + Poll::Ready(Some(block_root)) => { + return Poll::Ready(Some(InboundEvent::ReadyDataColumn(block_root.into_inner()))); + } + Poll::Ready(None) | Poll::Pending => (), + } + if let Some(next_backfill_batch_event) = self.next_backfill_batch_event.as_mut() { match next_backfill_batch_event.as_mut().poll(cx) { Poll::Ready(_) => { @@ -455,6 +487,7 @@ impl ReprocessQueue { attestations_delay_queue: DelayQueue::new(), lc_updates_delay_queue: DelayQueue::new(), column_reconstructions_delay_queue: DelayQueue::new(), + data_columns_delay_queue: DelayQueue::new(), queued_gossip_block_roots: HashSet::new(), awaiting_envelopes_per_root: HashMap::new(), queued_lc_updates: FnvHashMap::default(), @@ -464,6 +497,8 @@ impl ReprocessQueue { awaiting_lc_updates_per_parent_root: HashMap::new(), queued_backfill_batches: Vec::new(), queued_column_reconstructions: HashMap::new(), + awaiting_data_columns_per_root: HashMap::new(), + queued_data_columns_count: 0, next_attestation: 0, next_lc_update: 0, early_block_debounce: TimeLatch::default(), @@ -471,6 +506,7 @@ impl ReprocessQueue { rpc_block_debounce: TimeLatch::default(), attestation_delay_debounce: TimeLatch::default(), lc_update_delay_debounce: TimeLatch::default(), + data_column_delay_debounce: TimeLatch::default(), next_backfill_batch_event: None, slot_clock, } @@ -551,22 +587,16 @@ impl ReprocessQueue { return; } - // When the queue is full, evict the oldest entry to make room for newer envelopes. 
+ // When the queue is full, drop the new envelope. if self.awaiting_envelopes_per_root.len() >= MAXIMUM_QUEUED_ENVELOPES { if self.envelope_delay_debounce.elapsed() { warn!( queue_size = MAXIMUM_QUEUED_ENVELOPES, msg = "system resources may be saturated", - "Envelope delay queue is full, evicting oldest entry" + "Envelope delay queue is full, dropping envelope" ); } - if let Some(oldest_root) = - self.awaiting_envelopes_per_root.keys().next().copied() - && let Some((_envelope, delay_key)) = - self.awaiting_envelopes_per_root.remove(&oldest_root) - { - self.envelope_delay_queue.remove(&delay_key); - } + return; } // Register the timeout. @@ -688,6 +718,37 @@ impl ReprocessQueue { self.next_attestation += 1; } + InboundEvent::Msg(UnknownBlockDataColumn(queued_data_column)) => { + let block_root = queued_data_column.beacon_block_root; + + if self.queued_data_columns_count >= MAXIMUM_QUEUED_DATA_COLUMNS { + if self.data_column_delay_debounce.elapsed() { + warn!( + queue_size = MAXIMUM_QUEUED_DATA_COLUMNS, + msg = "system resources may be saturated", + "Data column delay queue is full, dropping column" + ); + } + return; + } + + if let Some((columns, _delay_key)) = + self.awaiting_data_columns_per_root.get_mut(&block_root) + { + // Append to existing entry; the timer for this root is already running. + columns.push(queued_data_column); + } else { + let delay_key = self.data_columns_delay_queue.insert( + block_root, + self.slot_clock.slot_duration() * QUEUED_DATA_COLUMN_DELAY_SLOTS, + ); + + self.awaiting_data_columns_per_root + .insert(block_root, (vec![queued_data_column], delay_key)); + } + + self.queued_data_columns_count += 1; + } InboundEvent::Msg(UnknownLightClientOptimisticUpdate( queued_light_client_optimistic_update, )) => { @@ -800,6 +861,25 @@ impl ReprocessQueue { ); } } + + // Unqueue the data columns we have for this root, if any. 
+ if let Some((data_columns, delay_key)) = + self.awaiting_data_columns_per_root.remove(&block_root) + { + self.data_columns_delay_queue.remove(&delay_key); + self.queued_data_columns_count = self + .queued_data_columns_count + .saturating_sub(data_columns.len()); + for data_column in data_columns { + if self + .ready_work_tx + .try_send(ReadyWork::DataColumn(data_column)) + .is_err() + { + error!(?block_root, "Failed to send data column for reprocessing"); + } + } + } } InboundEvent::Msg(NewLightClientOptimisticUpdate { parent_root }) => { // Unqueue the light client optimistic updates we have for this root, if any. @@ -1053,6 +1133,27 @@ impl ReprocessQueue { ); } } + InboundEvent::ReadyDataColumn(block_root) => { + if let Some((data_columns, _)) = + self.awaiting_data_columns_per_root.remove(&block_root) + { + self.queued_data_columns_count = self + .queued_data_columns_count + .saturating_sub(data_columns.len()); + for data_column in data_columns { + if self + .ready_work_tx + .try_send(ReadyWork::DataColumn(data_column)) + .is_err() + { + error!( + hint = "system may be overloaded", + "Ignored expired gossip data column" + ); + } + } + } + } } metrics::set_gauge_vec( @@ -1581,48 +1682,87 @@ mod tests { assert_eq!(queue.envelope_delay_queue.len(), 1); } + /// Tests that a queued gossip data column is released when its block is imported. 
#[tokio::test] - async fn envelope_capacity_evicts_oldest() { + async fn data_column_released_on_block_imported() { + create_test_tracing_subscriber(); + + let config = BeaconProcessorConfig::default(); + let (ready_work_tx, mut ready_work_rx) = + mpsc::channel::(config.max_scheduled_work_queue_len); + let (_, reprocess_work_rx) = + mpsc::channel::(config.max_scheduled_work_queue_len); + let slot_clock = Arc::new(testing_slot_clock(12)); + let mut queue = ReprocessQueue::new(ready_work_tx, reprocess_work_rx, slot_clock); + + tokio::time::pause(); + + let beacon_block_root = Hash256::repeat_byte(0xbb); + + let msg = ReprocessQueueMessage::UnknownBlockDataColumn(QueuedGossipDataColumn { + beacon_block_root, + process_fn: Box::new(|| {}), + }); + queue.handle_message(InboundEvent::Msg(msg)); + + assert_eq!(queue.awaiting_data_columns_per_root.len(), 1); + assert!( + queue + .awaiting_data_columns_per_root + .contains_key(&beacon_block_root) + ); + assert_eq!(queue.data_columns_delay_queue.len(), 1); + + // Simulate block import. + queue.handle_message(InboundEvent::Msg(ReprocessQueueMessage::BlockImported { + block_root: beacon_block_root, + parent_root: Hash256::repeat_byte(0x00), + })); + + // Internal state should be cleaned up. + assert!(queue.awaiting_data_columns_per_root.is_empty()); + assert_eq!(queue.data_columns_delay_queue.len(), 0); + + // The column should have been sent to the ready_work channel. + let ready = ready_work_rx.try_recv().expect("column should be ready"); + assert!(matches!(ready, ReadyWork::DataColumn(_))); + } + + /// Tests that an expired gossip data column is pruned cleanly from all internal state. + #[tokio::test] + async fn prune_awaiting_data_columns_per_root() { create_test_tracing_subscriber(); let mut queue = test_queue(); - // Pause time so it only advances manually tokio::time::pause(); - // Fill the queue to capacity. 
- for i in 0..MAXIMUM_QUEUED_ENVELOPES { - let block_root = Hash256::repeat_byte(i as u8); - let msg = ReprocessQueueMessage::UnknownBlockForEnvelope(QueuedGossipEnvelope { - beacon_block_slot: Slot::new(1), - beacon_block_root: block_root, - process_fn: Box::pin(async {}), - }); - queue.handle_message(InboundEvent::Msg(msg)); - } - assert_eq!( - queue.awaiting_envelopes_per_root.len(), - MAXIMUM_QUEUED_ENVELOPES - ); + let beacon_block_root = Hash256::repeat_byte(0xcd); - // One more should evict the oldest and insert the new one. - let overflow_root = Hash256::repeat_byte(0xff); - let msg = ReprocessQueueMessage::UnknownBlockForEnvelope(QueuedGossipEnvelope { - beacon_block_slot: Slot::new(1), - beacon_block_root: overflow_root, - process_fn: Box::pin(async {}), + let msg = ReprocessQueueMessage::UnknownBlockDataColumn(QueuedGossipDataColumn { + beacon_block_root, + process_fn: Box::new(|| {}), }); queue.handle_message(InboundEvent::Msg(msg)); - // Queue should still be at capacity, with the new root present. - assert_eq!( - queue.awaiting_envelopes_per_root.len(), - MAXIMUM_QUEUED_ENVELOPES - ); + assert_eq!(queue.awaiting_data_columns_per_root.len(), 1); assert!( queue - .awaiting_envelopes_per_root - .contains_key(&overflow_root) + .awaiting_data_columns_per_root + .contains_key(&beacon_block_root) ); + + // Advance time past the delay so the entry expires. + advance_time( + &queue.slot_clock, + 2 * queue.slot_clock.slot_duration() * QUEUED_DATA_COLUMN_DELAY_SLOTS, + ) + .await; + let ready_msg = queue.next().await.unwrap(); + assert!(matches!(ready_msg, InboundEvent::ReadyDataColumn(_))); + queue.handle_message(ready_msg); + + // All internal state should be cleaned up. 
+ assert!(queue.awaiting_data_columns_per_root.is_empty()); } } diff --git a/beacon_node/lighthouse_network/src/config.rs b/beacon_node/lighthouse_network/src/config.rs index db42d0cfa8..4d4d91a456 100644 --- a/beacon_node/lighthouse_network/src/config.rs +++ b/beacon_node/lighthouse_network/src/config.rs @@ -125,6 +125,9 @@ pub struct Config { /// Whether light client protocols should be enabled. pub enable_light_client_server: bool, + /// Whether to enable the deprecated mplex multiplexer alongside yamux. + pub enable_mplex: bool, + /// Configuration for the outbound rate limiter (requests made by this node). pub outbound_rate_limiter_config: Option, @@ -362,6 +365,7 @@ impl Default for Config { proposer_only: false, metrics_enabled: false, enable_light_client_server: true, + enable_mplex: false, outbound_rate_limiter_config: None, invalid_block_storage: None, inbound_rate_limiter_config: None, diff --git a/beacon_node/lighthouse_network/src/service/mod.rs b/beacon_node/lighthouse_network/src/service/mod.rs index 41d937e324..f5e2442f86 100644 --- a/beacon_node/lighthouse_network/src/service/mod.rs +++ b/beacon_node/lighthouse_network/src/service/mod.rs @@ -466,9 +466,13 @@ impl Network { } }; - // Set up the transport - tcp/quic with noise and mplex - let transport = build_transport(local_keypair.clone(), !config.disable_quic_support) - .map_err(|e| format!("Failed to build transport: {:?}", e))?; + // Set up the transport - tcp/quic with noise and yamux (mplex optional) + let transport = build_transport( + local_keypair.clone(), + !config.disable_quic_support, + config.enable_mplex, + ) + .map_err(|e| format!("Failed to build transport: {:?}", e))?; // use the executor for libp2p struct Executor(task_executor::TaskExecutor); diff --git a/beacon_node/lighthouse_network/src/service/utils.rs b/beacon_node/lighthouse_network/src/service/utils.rs index c7dabcb391..47629f4fd3 100644 --- a/beacon_node/lighthouse_network/src/service/utils.rs +++ 
b/beacon_node/lighthouse_network/src/service/utils.rs @@ -34,27 +34,39 @@ pub struct Context<'a> { type BoxedTransport = Boxed<(PeerId, StreamMuxerBox)>; /// The implementation supports TCP/IP, QUIC (experimental) over UDP, noise as the encryption layer, and -/// mplex/yamux as the multiplexing layer (when using TCP). +/// yamux as the multiplexing layer (when using TCP). Mplex can be optionally enabled. pub fn build_transport( local_private_key: Keypair, quic_support: bool, + enable_mplex: bool, ) -> std::io::Result { - // mplex config - let mut mplex_config = libp2p_mplex::Config::new(); - mplex_config.set_max_buffer_size(256); - mplex_config.set_max_buffer_behaviour(libp2p_mplex::MaxBufferBehaviour::Block); - // yamux config let yamux_config = yamux::Config::default(); + // Creates the TCP transport layer - let tcp = libp2p::tcp::tokio::Transport::new(libp2p::tcp::Config::default().nodelay(true)) - .upgrade(core::upgrade::Version::V1) - .authenticate(generate_noise_config(&local_private_key)) - .multiplex(core::upgrade::SelectUpgrade::new( - yamux_config, - mplex_config, - )) - .timeout(Duration::from_secs(10)); + let tcp: BoxedTransport = if enable_mplex { + // Enable both yamux and mplex. 
+ let mut mplex_config = libp2p_mplex::Config::new(); + mplex_config.set_max_num_streams(32); + mplex_config.set_max_buffer_behaviour(libp2p_mplex::MaxBufferBehaviour::ResetStream); + libp2p::tcp::tokio::Transport::new(libp2p::tcp::Config::default().nodelay(true)) + .upgrade(core::upgrade::Version::V1) + .authenticate(generate_noise_config(&local_private_key)) + .multiplex(core::upgrade::SelectUpgrade::new( + yamux_config, + mplex_config, + )) + .timeout(Duration::from_secs(10)) + .boxed() + } else { + // Yamux only + libp2p::tcp::tokio::Transport::new(libp2p::tcp::Config::default().nodelay(true)) + .upgrade(core::upgrade::Version::V1) + .authenticate(generate_noise_config(&local_private_key)) + .multiplex(yamux_config) + .timeout(Duration::from_secs(10)) + .boxed() + }; let transport = if quic_support { // Enables Quic // The default quic configuration suits us for now. diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index 0dd94ea9f8..b76f144887 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -61,8 +61,8 @@ use beacon_processor::work_reprocessing_queue::QueuedColumnReconstruction; use beacon_processor::{ DuplicateCache, GossipAggregatePackage, GossipAttestationBatch, work_reprocessing_queue::{ - QueuedAggregate, QueuedGossipBlock, QueuedGossipEnvelope, QueuedLightClientUpdate, - QueuedUnaggregate, ReprocessQueueMessage, + QueuedAggregate, QueuedGossipBlock, QueuedGossipDataColumn, QueuedGossipEnvelope, + QueuedLightClientUpdate, QueuedUnaggregate, ReprocessQueueMessage, }, }; @@ -657,6 +657,7 @@ impl NetworkBeaconProcessor { subnet_id: DataColumnSubnetId, column_sidecar: Arc>, seen_duration: Duration, + allow_reprocess: bool, ) { let slot = column_sidecar.slot(); let block_root = column_sidecar.block_root(); @@ -719,36 +720,67 @@ impl NetworkBeaconProcessor { 
MessageAcceptance::Accept, ); } - GossipDataColumnError::ParentUnknown { parent_root, .. } => { + GossipDataColumnError::ParentUnknown { parent_root, slot } => { debug!( action = "requesting parent", %block_root, %parent_root, "Unknown parent hash for column" ); - self.send_sync_message(SyncMessage::UnknownParentDataColumn( + self.send_sync_message(SyncMessage::UnknownParentSidecarHeader { peer_id, - column_sidecar, - )); + block_root, + parent_root, + slot, + }); } GossipDataColumnError::BlockRootUnknown { block_root: unknown_block_root, .. } => { debug!( - action = "ignoring", + action = "queuing for reprocessing", %unknown_block_root, "Unknown block root for column" ); - // TODO(gloas): wire this into proper lookup sync. Sending - // `UnknownBlockHashFromAttestation` here is a Fulu-shaped fallback that - // mixes column processing with the attestation lookup path and is not - // the right primitive for Gloas column lookups. self.propagate_validation_result( - message_id, + message_id.clone(), peer_id, MessageAcceptance::Ignore, ); + + if allow_reprocess { + // Queue the column for reprocessing when the block arrives. + let processor = self.clone(); + let reprocess_msg = ReprocessQueueMessage::UnknownBlockDataColumn( + QueuedGossipDataColumn { + beacon_block_root: unknown_block_root, + process_fn: Box::new(move || { + let _ = processor.send_gossip_data_column_sidecar( + message_id, + peer_id, + subnet_id, + column_sidecar, + seen_duration, + false, // Do not reprocess this message again. 
+ ); + }), + }, + ); + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess(reprocess_msg), + }) + .is_err() + { + debug!( + %unknown_block_root, + "Failed to queue data column for reprocessing" + ); + } + } } GossipDataColumnError::InvalidVariant | GossipDataColumnError::PubkeyCacheTimeout @@ -1047,7 +1079,7 @@ impl NetworkBeaconProcessor { %parent_root, "Unknown parent hash for partial column" ); - self.send_sync_message(SyncMessage::UnknownParentPartialDataColumn { + self.send_sync_message(SyncMessage::UnknownParentSidecarHeader { peer_id, block_root, parent_root, diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index c2c8577046..f3c773eb25 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -201,6 +201,7 @@ impl NetworkBeaconProcessor { subnet_id: DataColumnSubnetId, column_sidecar: Arc>, seen_timestamp: Duration, + allow_reprocess: bool, ) -> Result<(), Error> { let processor = self.clone(); let process_fn = async move { @@ -211,6 +212,7 @@ impl NetworkBeaconProcessor { subnet_id, column_sidecar, seen_timestamp, + allow_reprocess, ) .await }; diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index c0b093e254..ad98851532 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -412,6 +412,7 @@ impl TestRig { DataColumnSubnetId::from_column_index(*data_column.index(), &self.chain.spec), data_column.clone(), Duration::from_secs(0), + true, ) .unwrap(); } diff --git a/beacon_node/network/src/router.rs b/beacon_node/network/src/router.rs index a8e5c9ae4a..277ece0aa8 100644 --- a/beacon_node/network/src/router.rs +++ b/beacon_node/network/src/router.rs @@ -422,6 +422,7 @@ impl Router { subnet_id, 
column_sidecar, seen_timestamp, + true, ), ) } diff --git a/beacon_node/network/src/sync/block_lookups/common.rs b/beacon_node/network/src/sync/block_lookups/common.rs deleted file mode 100644 index 4306458615..0000000000 --- a/beacon_node/network/src/sync/block_lookups/common.rs +++ /dev/null @@ -1,164 +0,0 @@ -use crate::sync::block_lookups::single_block_lookup::{ - LookupRequestError, SingleBlockLookup, SingleLookupRequestState, -}; -use crate::sync::block_lookups::{BlockRequestState, CustodyRequestState, PeerId}; -use crate::sync::manager::BlockProcessType; -use crate::sync::network_context::{LookupRequestResult, SyncNetworkContext}; -use beacon_chain::BeaconChainTypes; -use lighthouse_network::service::api_types::Id; -use parking_lot::RwLock; -use std::collections::HashSet; -use std::sync::Arc; -use types::{DataColumnSidecarList, SignedBeaconBlock}; - -use super::SingleLookupId; -use super::single_block_lookup::{ComponentRequests, DownloadResult}; - -#[derive(Debug, Copy, Clone)] -pub enum ResponseType { - Block, - CustodyColumn, -} - -/// This trait unifies common single block lookup functionality across blocks and data columns. -/// This includes making requests, verifying responses, and handling processing results. A -/// `SingleBlockLookup` includes both a `BlockRequestState` and a `CustodyRequestState`, this trait -/// is implemented for each. -/// -/// The use of the `ResponseType` associated type gives us a degree of type -/// safety when handling a block/column response ensuring we only mutate the correct corresponding -/// state. -pub trait RequestState { - /// The type created after validation. - type VerifiedResponseType: Clone; - - /// Request the network context to prepare a request of a component of `block_root`. If the - /// request is not necessary because the component is already known / processed, return false. - /// Return true if it sent a request and we can expect an event back from the network. 
- fn make_request( - &self, - id: Id, - lookup_peers: Arc>>, - expected_blobs: usize, - cx: &mut SyncNetworkContext, - ) -> Result; - - /* Response handling methods */ - - /// Send the response to the beacon processor. - fn send_for_processing( - id: Id, - result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError>; - - /* Utility methods */ - - /// Returns the `ResponseType` associated with this trait implementation. Useful in logging. - fn response_type() -> ResponseType; - - /// A getter for the `BlockRequestState` or `CustodyRequestState` associated with this trait. - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str>; - - /// A getter for a reference to the `SingleLookupRequestState` associated with this trait. - fn get_state(&self) -> &SingleLookupRequestState; - - /// A getter for a mutable reference to the SingleLookupRequestState associated with this trait. - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState; -} - -impl RequestState for BlockRequestState { - type VerifiedResponseType = Arc>; - - fn make_request( - &self, - id: SingleLookupId, - lookup_peers: Arc>>, - _: usize, - cx: &mut SyncNetworkContext, - ) -> Result { - cx.block_lookup_request(id, lookup_peers, self.requested_block_root) - .map_err(LookupRequestError::SendFailedNetwork) - } - - fn send_for_processing( - id: SingleLookupId, - download_result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError> { - let DownloadResult { - value, - block_root, - seen_timestamp, - .. 
- } = download_result; - cx.send_block_for_processing(id, block_root, value, seen_timestamp) - .map_err(LookupRequestError::SendFailedProcessor) - } - - fn response_type() -> ResponseType { - ResponseType::Block - } - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str> { - Ok(&mut request.block_request_state) - } - fn get_state(&self) -> &SingleLookupRequestState { - &self.state - } - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState { - &mut self.state - } -} - -impl RequestState for CustodyRequestState { - type VerifiedResponseType = DataColumnSidecarList; - - fn make_request( - &self, - id: Id, - lookup_peers: Arc>>, - _: usize, - cx: &mut SyncNetworkContext, - ) -> Result { - cx.custody_lookup_request(id, self.block_root, self.slot, lookup_peers) - .map_err(LookupRequestError::SendFailedNetwork) - } - - fn send_for_processing( - id: Id, - download_result: DownloadResult, - cx: &SyncNetworkContext, - ) -> Result<(), LookupRequestError> { - let DownloadResult { - value, - block_root, - seen_timestamp, - .. - } = download_result; - cx.send_custody_columns_for_processing( - id, - block_root, - value, - seen_timestamp, - BlockProcessType::SingleCustodyColumn(id), - ) - .map_err(LookupRequestError::SendFailedProcessor) - } - - fn response_type() -> ResponseType { - ResponseType::CustodyColumn - } - fn request_state_mut(request: &mut SingleBlockLookup) -> Result<&mut Self, &'static str> { - match &mut request.component_requests { - ComponentRequests::WaitingForBlock => Err("waiting for block"), - ComponentRequests::ActiveCustodyRequest(request) => Ok(request), - ComponentRequests::NotNeeded { .. 
} => Err("not needed"), - } - } - fn get_state(&self) -> &SingleLookupRequestState { - &self.state - } - fn get_state_mut(&mut self) -> &mut SingleLookupRequestState { - &mut self.state - } -} diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index ecaee7c0ec..a265373e3f 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -24,27 +24,23 @@ use self::parent_chain::{NodeChain, compute_parent_chains}; pub use self::single_block_lookup::DownloadResult; use self::single_block_lookup::{LookupRequestError, LookupResult, SingleBlockLookup}; use super::manager::{BlockProcessType, SLOT_IMPORT_TOLERANCE}; -use super::network_context::{PeerGroup, RpcResponseError, SyncNetworkContext}; +use super::network_context::{RpcResponseError, SyncNetworkContext}; use crate::metrics; use crate::network_beacon_processor::BlockProcessingResult; use crate::sync::SyncMessage; use crate::sync::block_lookups::parent_chain::find_oldest_fork_ancestor; use beacon_chain::BeaconChainTypes; -use beacon_chain::block_verification_types::AsBlock; -pub use common::RequestState; use fnv::FnvHashMap; use lighthouse_network::PeerId; use lighthouse_network::service::api_types::SingleLookupReqId; use lru_cache::LRUTimeCache; -pub use single_block_lookup::{BlockRequestState, CustodyRequestState}; use std::collections::hash_map::Entry; use std::sync::Arc; use std::time::Duration; use store::Hash256; use tracing::{debug, error, warn}; -use types::{EthSpec, SignedBeaconBlock}; +use types::{DataColumnSidecarList, EthSpec, SignedBeaconBlock}; -pub mod common; pub mod parent_chain; mod single_block_lookup; @@ -74,38 +70,17 @@ const LOOKUP_MAX_DURATION_NO_PEERS_SECS: u64 = 10; /// take at most 2 GB. 200 lookups allow 3 parallel chains of depth 64 (current maximum). 
const MAX_LOOKUPS: usize = 200; -/// The values for `Blob`, `DataColumn` and `PartialDataColumn` is the parent root of the column. +type BlockDownloadResponse = Result>>, RpcResponseError>; +type CustodyDownloadResponse = + Result>, RpcResponseError>; + pub enum BlockComponent { Block(DownloadResult>>), - DataColumn(DownloadResult), - PartialDataColumn(DownloadResult), -} - -impl BlockComponent { - fn parent_root(&self) -> Hash256 { - match self { - BlockComponent::Block(block) => block.value.parent_root(), - BlockComponent::DataColumn(parent_root) - | BlockComponent::PartialDataColumn(parent_root) => parent_root.value, - } - } - fn get_type(&self) -> &'static str { - match self { - BlockComponent::Block(_) => "block", - BlockComponent::DataColumn(_) => "data_column", - BlockComponent::PartialDataColumn(_) => "partial_data_column", - } - } + Sidecar, } pub type SingleLookupId = u32; -enum Action { - Retry, - ParentUnknown { parent_root: Hash256 }, - Continue, -} - pub struct BlockLookups { /// A cache of block roots that must be ignored for some time to prevent useless searches. For /// example if a chain is too long, its lookup chain is dropped, and range sync is expected to @@ -193,11 +168,10 @@ impl BlockLookups { &mut self, block_root: Hash256, block_component: BlockComponent, + parent_root: Hash256, peer_id: PeerId, cx: &mut SyncNetworkContext, ) -> bool { - let parent_root = block_component.parent_root(); - let parent_lookup_exists = self.search_parent_of_child(parent_root, block_root, &[peer_id], cx); // Only create the child lookup if the parent exists @@ -207,7 +181,7 @@ impl BlockLookups { block_root, Some(block_component), Some(parent_root), - // On a `UnknownParentBlock` or `UnknownParentDataColumn` event the peer is not + // On a `UnknownParentBlock` or `UnknownParentSidecarHeader` event the peer is not // required to have the rest of the block components. Create the lookup with zero // peers to house the block components. 
&[], @@ -218,7 +192,7 @@ impl BlockLookups { } } - /// Seach a block whose parent root is unknown. + /// Search a block whose parent root is unknown. /// /// Returns true if the lookup is created or already exists #[must_use = "only reference the new lookup if returns true"] @@ -361,13 +335,9 @@ impl BlockLookups { .find(|(_id, lookup)| lookup.is_for_block(block_root)) { if let Some(block_component) = block_component { - let component_type = block_component.get_type(); let imported = lookup.add_child_components(block_component); if !imported { - debug!( - ?block_root, - component_type, "Lookup child component ignored" - ); + debug!(?block_root, "Lookup child component ignored"); } } @@ -439,88 +409,33 @@ impl BlockLookups { /* Lookup responses */ - /// Process a block or blob response received from a single lookup request. - pub fn on_download_response>( + /// Process a block response received from a single lookup request. + pub fn on_block_download_response( &mut self, id: SingleLookupReqId, - response: Result<(R::VerifiedResponseType, PeerGroup, Duration), RpcResponseError>, + response: BlockDownloadResponse, cx: &mut SyncNetworkContext, ) { - let result = self.on_download_response_inner::(id, response, cx); - self.on_lookup_result(id.lookup_id, result, "download_response", cx); + let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else { + debug!(?id, "Block returned for single block lookup not present"); + return; + }; + let result = lookup.on_block_download_response(id.req_id, response, cx); + self.on_lookup_result(id.lookup_id, result, "block_download_response", cx); } - /// Process a block or blob response received from a single lookup request. 
- pub fn on_download_response_inner>( + pub fn on_custody_download_response( &mut self, id: SingleLookupReqId, - response: Result<(R::VerifiedResponseType, PeerGroup, Duration), RpcResponseError>, + response: CustodyDownloadResponse, cx: &mut SyncNetworkContext, - ) -> Result { - // Note: no need to downscore peers here, already downscored on network context - - let response_type = R::response_type(); + ) { let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else { - // We don't have the ability to cancel in-flight RPC requests. So this can happen - // if we started this RPC request, and later saw the block/blobs via gossip. - debug!(?id, "Block returned for single block lookup not present"); - return Err(LookupRequestError::UnknownLookup); + debug!(?id, "Custody returned for single block lookup not present"); + return; }; - - let block_root = lookup.block_root(); - let request_state = R::request_state_mut(lookup) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))? - .get_state_mut(); - - match response { - Ok((response, peer_group, seen_timestamp)) => { - debug!( - ?block_root, - ?id, - ?peer_group, - ?response_type, - "Received lookup download success" - ); - - // Here we could check if response extends a parent chain beyond its max length. - // However we defer that check to the handling of a processing error ParentUnknown. - // - // Here we could check if there's already a lookup for parent_root of `response`. In - // that case we know that sending the response for processing will likely result in - // a `ParentUnknown` error. However, for simplicity we choose to not implement this - // optimization. - - // Register the download peer here. Once we have received some data over the wire we - // attribute it to this peer for scoring latter regardless of how the request was - // done. 
- request_state.on_download_success( - id.req_id, - DownloadResult { - value: response, - block_root, - seen_timestamp, - peer_group, - }, - )?; - // continue_request will send for processing as the request state is AwaitingProcessing - } - Err(e) => { - // No need to log peer source here. When sending a DataColumnsByRoot request we log - // the peer and the request ID which is linked to this `id` value here. - debug!( - ?block_root, - ?id, - ?response_type, - error = ?e, - "Received lookup download failure" - ); - - request_state.on_download_failure(id.req_id)?; - // continue_request will retry a download as the request state is AwaitingDownload - } - } - - lookup.continue_requests(cx) + let result = lookup.on_custody_download_response(id.req_id, response, cx); + self.on_lookup_result(id.lookup_id, result, "custody_download_response", cx); } /* Error responses */ @@ -542,128 +457,29 @@ impl BlockLookups { result: BlockProcessingResult, cx: &mut SyncNetworkContext, ) { - let lookup_result = match process_type { - BlockProcessType::SingleBlock { id } => { - self.on_processing_result_inner::>(id, result, cx) - } - BlockProcessType::SingleCustodyColumn(id) => { - self.on_processing_result_inner::>(id, result, cx) - } - // TODO(gloas): route into the payload envelope lookup state machine. 
- BlockProcessType::SinglePayloadEnvelope(_) => Ok(LookupResult::Pending), - }; - self.on_lookup_result(process_type.id(), lookup_result, "processing_result", cx); - } - - pub fn on_processing_result_inner>( - &mut self, - lookup_id: SingleLookupId, - result: BlockProcessingResult, - cx: &mut SyncNetworkContext, - ) -> Result { + let lookup_id = process_type.id(); let Some(lookup) = self.single_block_lookups.get_mut(&lookup_id) else { debug!(id = lookup_id, "Unknown single block lookup"); - return Err(LookupRequestError::UnknownLookup); + return; }; - let block_root = lookup.block_root(); - let request_state = R::request_state_mut(lookup) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))? - .get_state_mut(); - debug!( - component = ?R::response_type(), - ?block_root, + block_root = ?lookup.block_root(), id = lookup_id, + ?process_type, ?result, "Received lookup processing result" ); - let action = match result { - BlockProcessingResult::Imported(fully_imported, _info) => { - // `on_processing_success` is called here to ensure the request state is updated - // prior to checking if all components have been processed (relevant for - // MissingComponents). - request_state.on_processing_success()?; - - if fully_imported { - Action::Continue - } else if lookup.all_components_processed() { - // We don't request for other block components until being sure that the block has - // data. If we request blobs / columns to a peer we are sure those must exist. - // Therefore if all components are processed and we still receive `MissingComponents` - // it indicates an internal bug. - return Err(LookupRequestError::Failed( - "missing components after all processed".to_owned(), - )); - } else { - Action::Retry - } - } - BlockProcessingResult::ParentUnknown { parent_root } => { - // `BlockError::ParentUnknown` is only returned when processing blocks. Reverts - // the status of this request to `AwaitingProcessing` holding the downloaded - // data. 
A future call to `continue_requests` will re-submit it once there are - // no pending parent requests. - request_state.revert_to_awaiting_processing()?; - Action::ParentUnknown { parent_root } - } - BlockProcessingResult::Error { penalty, reason } => { - // Retry on every processing error: `on_processing_failure` increments the - // per-component failure counter, so `SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS` bounds the - // retry loop and eventually drops the lookup if the failure persists. Whether the - // peer should be downscored is the producer's call (encoded in `penalty`). - debug!( - ?block_root, - component = ?R::response_type(), - reason, - ?penalty, - "Lookup component processing failed; retrying" - ); - let peer_group = request_state.on_processing_failure()?; - if let Some((action_kind, whom, msg)) = penalty { - whom.apply(action_kind, &peer_group, msg, cx); - } - Action::Retry + let lookup_result = match process_type { + BlockProcessType::SingleBlock { .. } => lookup.on_block_processing_result(result, cx), + BlockProcessType::SingleCustodyColumn(_) => { + lookup.on_data_processing_result(result, cx) } + // TODO(gloas): route into the payload envelope lookup state machine. + BlockProcessType::SinglePayloadEnvelope(_) => Ok(LookupResult::Pending), }; - - match action { - Action::Retry => { - // Trigger download for all components in case `MissingComponents` failed the blob - // request. Also if blobs are `AwaitingProcessing` and need to be progressed - lookup.continue_requests(cx) - } - Action::ParentUnknown { parent_root } => { - let peers = lookup.all_peers(); - // Mark lookup as awaiting **before** creating the parent lookup. At this point the - // lookup maybe inconsistent. - lookup.set_awaiting_parent(parent_root); - let parent_lookup_exists = - self.search_parent_of_child(parent_root, block_root, &peers, cx); - if parent_lookup_exists { - // The parent lookup exist or has been created. It's safe for `lookup` to - // reference the parent as awaiting. 
- debug!( - id = lookup_id, - ?block_root, - ?parent_root, - "Marking lookup as awaiting parent" - ); - Ok(LookupResult::Pending) - } else { - // The parent lookup is faulty and was not created, we must drop the `lookup` as - // it's in an inconsistent state. We must drop all of its children too. - Err(LookupRequestError::Failed(format!( - "Parent lookup is faulty {parent_root:?}" - ))) - } - } - Action::Continue => { - // Drop this completed lookup only - Ok(LookupResult::Completed) - } - } + self.on_lookup_result(lookup_id, lookup_result, "processing_result", cx); } pub fn on_external_processing_result( @@ -760,7 +576,20 @@ impl BlockLookups { cx: &mut SyncNetworkContext, ) -> bool { match result { - Ok(LookupResult::Pending) => true, // no action + Ok(LookupResult::Pending) => true, + Ok(LookupResult::ParentUnknown { + parent_root, + block_root, + peers, + }) => { + if self.search_parent_of_child(parent_root, block_root, &peers, cx) { + true + } else { + self.drop_lookup_and_children(id, "Failed"); + self.update_metrics(); + false + } + } Ok(LookupResult::Completed) => { if let Some(lookup) = self.single_block_lookups.remove(&id) { debug!( @@ -926,6 +755,7 @@ impl BlockLookups { } /// Adds peers to a lookup and its ancestors recursively. 
+ /// /// Note: Takes a `lookup_id` as argument to allow recursion on mutable lookups, without having /// to duplicate the code to add peers to a lookup fn add_peers_to_lookup_and_ancestors( @@ -952,12 +782,12 @@ impl BlockLookups { } if let Some(parent_root) = lookup.awaiting_parent() { - if let Some((&child_id, _)) = self + if let Some((&parent_id, _)) = self .single_block_lookups .iter() .find(|(_, l)| l.block_root() == parent_root) { - self.add_peers_to_lookup_and_ancestors(child_id, peers, cx) + self.add_peers_to_lookup_and_ancestors(parent_id, peers, cx) } else { Err(format!("Lookup references unknown parent {parent_root:?}")) } diff --git a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs index dda58023be..8eb58da4e6 100644 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs @@ -1,15 +1,17 @@ use super::{BlockComponent, PeerId, SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS}; -use crate::sync::block_lookups::common::RequestState; +use crate::network_beacon_processor::BlockProcessingResult; +use crate::sync::block_lookups::{BlockDownloadResponse, CustodyDownloadResponse}; +use crate::sync::manager::BlockProcessType; use crate::sync::network_context::{ - LookupRequestResult, PeerGroup, ReqId, RpcRequestSendError, SendErrorProcessor, - SyncNetworkContext, + LookupRequestResult, PeerGroup, ReqId, RpcRequestSendError, RpcResponseError, + SendErrorProcessor, SyncNetworkContext, }; -use beacon_chain::{BeaconChainTypes, BlockProcessStatus}; +use beacon_chain::BeaconChainTypes; +use beacon_chain::block_verification_types::AsBlock; use educe::Educe; use lighthouse_network::service::api_types::Id; use parking_lot::RwLock; use std::collections::HashSet; -use std::fmt::Debug; use std::sync::Arc; use std::time::{Duration, Instant}; use store::Hash256; @@ -24,15 +26,18 @@ pub enum LookupResult { Completed, /// 
Lookup is expecting some future event from the network Pending, + /// Block's parent is not known to fork-choice, a parent lookup is needed + ParentUnknown { + parent_root: Hash256, + block_root: Hash256, + peers: Vec, + }, } #[derive(Debug, PartialEq, Eq, IntoStaticStr)] pub enum LookupRequestError { /// Too many failed attempts - TooManyAttempts { - /// The failed attempts were primarily due to processing failures. - cannot_process: bool, - }, + TooManyAttempts, /// Error sending event to network SendFailedNetwork(RpcRequestSendError), /// Error sending event to processor @@ -52,33 +57,63 @@ pub enum LookupRequestError { }, } +#[derive(Debug)] +struct BlockRequest { + state: SingleLookupRequestState>>, +} + +impl BlockRequest { + fn new() -> Self { + Self { + state: SingleLookupRequestState::new(), + } + } + + fn is_complete(&self) -> bool { + self.state.is_processed() + } +} + +#[derive(Debug)] +enum DataRequest { + WaitingForBlock, + Request { + slot: Slot, + state: SingleLookupRequestState>, + }, + NoData, +} + +impl DataRequest { + fn is_complete(&self) -> bool { + match &self { + DataRequest::WaitingForBlock => false, + DataRequest::Request { state, .. } => state.is_processed(), + DataRequest::NoData => true, + } + } +} + +type PeerSet = Arc>>; + #[derive(Educe)] #[educe(Debug(bound(T: BeaconChainTypes)))] pub struct SingleBlockLookup { pub id: Id, - pub block_request_state: BlockRequestState, - pub component_requests: ComponentRequests, + block_root: Hash256, + block_request: BlockRequest, + data_request: DataRequest, /// Peers that claim to have imported this set of block components. This state is shared with /// the custody request to have an updated view of the peers that claim to have imported the /// block associated with this lookup. The peer set of a lookup can change rapidly, and faster /// than the lifetime of a custody request. 
#[educe(Debug(method(fmt_peer_set_as_len)))] - peers: Arc>>, - block_root: Hash256, + peers: PeerSet, awaiting_parent: Option, created: Instant, pub(crate) span: Span, } -#[derive(Debug)] -pub(crate) enum ComponentRequests { - WaitingForBlock, - ActiveCustodyRequest(CustodyRequestState), - // When printing in debug this state display the reason why it's not needed - #[allow(dead_code)] - NotNeeded(&'static str), -} - impl SingleBlockLookup { pub fn new( requested_block_root: Hash256, @@ -94,25 +129,25 @@ impl SingleBlockLookup { Self { id, - block_request_state: BlockRequestState::new(requested_block_root), - component_requests: ComponentRequests::WaitingForBlock, - peers: Arc::new(RwLock::new(HashSet::from_iter(peers.iter().copied()))), block_root: requested_block_root, + block_request: BlockRequest::new(), + data_request: DataRequest::WaitingForBlock, + peers: Arc::new(RwLock::new(peers.iter().copied().collect())), awaiting_parent, created: Instant::now(), span: lookup_span, } } - /// Reset the status of all internal requests + /// Reset the status of all requests (used on block processing failure) pub fn reset_requests(&mut self) { - self.block_request_state = BlockRequestState::new(self.block_root); - self.component_requests = ComponentRequests::WaitingForBlock; + self.block_request = BlockRequest::new(); + self.data_request = DataRequest::WaitingForBlock; } - /// Return the slot of this lookup's block if it's currently cached as `AwaitingProcessing` + /// Return the slot of this lookup's block if it's currently cached pub fn peek_downloaded_block_slot(&self) -> Option { - self.block_request_state + self.block_request .state .peek_downloaded_data() .map(|block| block.slot()) @@ -147,15 +182,12 @@ impl SingleBlockLookup { /// Maybe insert a verified response into this lookup. 
Returns true if imported pub fn add_child_components(&mut self, block_component: BlockComponent) -> bool { match block_component { - BlockComponent::Block(block) => self - .block_request_state - .state - .insert_verified_response(block), - BlockComponent::DataColumn(_) | BlockComponent::PartialDataColumn(_) => { - // For now ignore single blobs and columns, as the blob request state assumes all blobs are - // attributed to the same peer = the peer serving the remaining blobs. Ignoring this - // block component has a minor effect, causing the node to re-request this blob - // once the parent chain is successfully resolved + BlockComponent::Block(block) => { + self.block_request.state.insert_verified_response(block) + } + BlockComponent::Sidecar => { + // There's nothing to do here, there's no component to insert. The lookup downloads + // its required data columns itself once it has the block. false } } @@ -166,29 +198,14 @@ impl SingleBlockLookup { self.block_root() == block_root } - /// Returns true if the block has already been downloaded. - pub fn all_components_processed(&self) -> bool { - self.block_request_state.state.is_processed() - && match &self.component_requests { - ComponentRequests::WaitingForBlock => false, - ComponentRequests::ActiveCustodyRequest(request) => request.state.is_processed(), - ComponentRequests::NotNeeded { .. } => true, - } - } - /// Returns true if this request is expecting some event to make progress pub fn is_awaiting_event(&self) -> bool { self.awaiting_parent.is_some() - || self.block_request_state.state.is_awaiting_event() - || match &self.component_requests { - // If components are waiting for the block request to complete, here we should - // check if the`block_request_state.state.is_awaiting_event(). However we already - // checked that above, so `WaitingForBlock => false` is equivalent. 
- ComponentRequests::WaitingForBlock => false, - ComponentRequests::ActiveCustodyRequest(request) => { - request.state.is_awaiting_event() - } - ComponentRequests::NotNeeded { .. } => false, + || self.block_request.state.is_awaiting_event() + || match &self.data_request { + DataRequest::WaitingForBlock => true, + DataRequest::Request { state, .. } => state.is_awaiting_event(), + DataRequest::NoData => false, } } @@ -199,139 +216,167 @@ impl SingleBlockLookup { cx: &mut SyncNetworkContext, ) -> Result { let _guard = self.span.clone().entered(); - // TODO: Check what's necessary to download, specially for blobs - self.continue_request::>(cx, 0)?; - if let ComponentRequests::WaitingForBlock = self.component_requests { - let downloaded_block = self - .block_request_state - .state - .peek_downloaded_data() - .cloned(); - - if let Some(block) = downloaded_block.or_else(|| { - // If the block is already being processed or fully validated, retrieve how many blobs - // it expects. Consider any stage of the block. If the block root has been validated, we - // can assert that this is the correct value of `blob_kzg_commitments_count`. - match cx.chain.get_block_process_status(&self.block_root) { - BlockProcessStatus::Unknown => None, - BlockProcessStatus::NotValidated(block, _) - | BlockProcessStatus::ExecutionValidated(block) => Some(block.clone()), - } - }) { - let expected_blobs = block.num_expected_blobs(); - let block_epoch = block.slot().epoch(T::EthSpec::slots_per_epoch()); - if expected_blobs == 0 { - self.component_requests = ComponentRequests::NotNeeded("no data"); - } else if cx.chain.should_fetch_custody_columns(block_epoch) { - self.component_requests = ComponentRequests::ActiveCustodyRequest( - CustodyRequestState::new(self.block_root, block.slot()), - ); - } else { - self.component_requests = ComponentRequests::NotNeeded("outside da window"); - } - } else { - // Wait to download the block before downloading blobs. 
Then we can be sure that the - // block has data, so there's no need to do "blind" requests for all possible blobs and - // latter handle the case where if the peer sent no blobs, penalize. - // - // Lookup sync event safety: Reaching this code means that a block is not in any pre-import - // cache nor in the request state of this lookup. Therefore, the block must either: (1) not - // be downloaded yet or (2) the block is already imported into the fork-choice. - // In case (1) the lookup must either successfully download the block or get dropped. - // In case (2) the block will be downloaded, processed, reach `DuplicateFullyImported` - // and get dropped as completed. - } + // === Block request === + self.block_request.state.maybe_start_downloading(|| { + cx.block_lookup_request(self.id, self.peers.clone(), self.block_root) + })?; + if self.awaiting_parent.is_none() + && let Some(data) = self.block_request.state.maybe_start_processing() + { + cx.send_block_for_processing(self.id, self.block_root, data.value, data.seen_timestamp) + .map_err(LookupRequestError::SendFailedProcessor)?; } - match &self.component_requests { - ComponentRequests::WaitingForBlock => {} // do nothing - ComponentRequests::ActiveCustodyRequest(_) => { - self.continue_request::>(cx, 0)? 
+ // === Data request === + loop { + match &mut self.data_request { + DataRequest::WaitingForBlock => { + if let Some(block) = self.block_request.state.peek_downloaded_data() { + let block_epoch = block + .slot() + .epoch(::EthSpec::slots_per_epoch()); + self.data_request = if block.num_expected_blobs() == 0 { + DataRequest::NoData + } else if cx.chain.should_fetch_custody_columns(block_epoch) { + DataRequest::Request { + slot: block.slot(), + state: SingleLookupRequestState::new(), + } + } else { + DataRequest::NoData + }; + } else { + break; + } + } + DataRequest::Request { slot, state } => { + state.maybe_start_downloading(|| { + cx.custody_lookup_request( + self.id, + self.block_root, + *slot, + self.peers.clone(), + ) + })?; + // Wait for the parent to be imported, data column processing result handle does + // not support `ParentUnknown`. + if self.awaiting_parent.is_none() + && let Some(data) = state.maybe_start_processing() + { + cx.send_custody_columns_for_processing( + self.id, + self.block_root, + data.value, + data.seen_timestamp, + BlockProcessType::SingleCustodyColumn(self.id), + ) + .map_err(LookupRequestError::SendFailedProcessor)?; + } + break; + } + DataRequest::NoData => break, } - ComponentRequests::NotNeeded { .. } => {} // do nothing } // If all components of this lookup are already processed, there will be no future events // that can make progress so it must be dropped. Consider the lookup completed. // This case can happen if we receive the components from gossip during a retry. - if self.all_components_processed() { - self.span = Span::none(); - Ok(LookupResult::Completed) - } else { - Ok(LookupResult::Pending) + if self.block_request.is_complete() && self.data_request.is_complete() { + return Ok(LookupResult::Completed); } + + Ok(LookupResult::Pending) } - /// Potentially makes progress on this request if it's in a progress-able state - fn continue_request>( + /// Handle block processing result. Advances the lookup state machine. 
+ pub fn on_block_processing_result( &mut self, + result: BlockProcessingResult, cx: &mut SyncNetworkContext, - expected_blobs: usize, - ) -> Result<(), LookupRequestError> { - let id = self.id; - let awaiting_parent = self.awaiting_parent.is_some(); - let request = - R::request_state_mut(self).map_err(|e| LookupRequestError::BadState(e.to_owned()))?; - - // Attempt to progress awaiting downloads - if request.get_state().is_awaiting_download() { - // Verify the current request has not exceeded the maximum number of attempts. - let request_state = request.get_state(); - if request_state.failed_attempts() >= SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS { - let cannot_process = request_state.more_failed_processing_attempts(); - return Err(LookupRequestError::TooManyAttempts { cannot_process }); + ) -> Result { + match result { + BlockProcessingResult::Imported(_fully_imported, _info) => { + self.block_request.state.on_processing_success()?; } - - let peers = self.peers.clone(); - let request = R::request_state_mut(self) - .map_err(|e| LookupRequestError::BadState(e.to_owned()))?; - - match request.make_request(id, peers, expected_blobs, cx)? { - LookupRequestResult::RequestSent(req_id) => { - // Lookup sync event safety: If make_request returns `RequestSent`, we are - // guaranteed that `BlockLookups::on_download_response` will be called exactly - // with this `req_id`. - request.get_state_mut().on_download_start(req_id)? - } - LookupRequestResult::NoRequestNeeded(reason) => { - // Lookup sync event safety: Advances this request to the terminal `Processed` - // state. If all requests reach this state, the request is marked as completed - // in `Self::continue_requests`. - request.get_state_mut().on_completed_request(reason)? 
- } - // Sync will receive a future event to make progress on the request, do nothing now - LookupRequestResult::Pending(reason) => { - // Lookup sync event safety: Refer to the code paths constructing - // `LookupRequestResult::Pending` - request - .get_state_mut() - .update_awaiting_download_status(reason); - return Ok(()); + BlockProcessingResult::ParentUnknown { parent_root } => { + // `BlockError::ParentUnknown` is only returned when processing blocks. Revert the + // block request to `Downloaded` and park this lookup until the parent resolves; a + // future call to `continue_requests` will re-submit the block for processing once + // the parent lookup completes. + self.block_request.state.revert_to_awaiting_processing()?; + self.set_awaiting_parent(parent_root); + return Ok(LookupResult::ParentUnknown { + parent_root, + block_root: self.block_root, + peers: self.all_peers(), + }); + } + BlockProcessingResult::Error { penalty, .. } => { + let peers = self.block_request.state.on_processing_failure()?; + if let Some((action, whom, msg)) = penalty { + whom.apply(action, &peers, msg, cx); } } - - // Otherwise, attempt to progress awaiting processing - // If this request is awaiting a parent lookup to be processed, do not send for processing. - // The request will be rejected with unknown parent error. - } else if !awaiting_parent { - // maybe_start_processing returns Some if state == AwaitingProcess. This pattern is - // useful to conditionally access the result data. - if let Some(result) = request.get_state_mut().maybe_start_processing() { - // Lookup sync event safety: If `send_for_processing` returns Ok() we are guaranteed - // that `BlockLookups::on_processing_result` will be called exactly once with this - // lookup_id - return R::send_for_processing(id, result, cx); - } - // Lookup sync event safety: If the request is not in `AwaitingDownload` or - // `AwaitingProcessing` state it is guaranteed to receive some event to make progress. 
} + self.continue_requests(cx) + } - // Lookup sync event safety: If a lookup is awaiting a parent we are guaranteed to either: - // (1) attempt to make progress with `BlockLookups::continue_child_lookups` if the parent - // lookup completes, or (2) get dropped if the parent fails and is dropped. + /// Handle data processing result + pub fn on_data_processing_result( + &mut self, + result: BlockProcessingResult, + cx: &mut SyncNetworkContext, + ) -> Result { + let DataRequest::Request { state, .. } = &mut self.data_request else { + return Err(LookupRequestError::BadState("no data_request".to_owned())); + }; - Ok(()) + match result { + BlockProcessingResult::Imported(_fully_imported, _info) => { + state.on_processing_success()?; + } + BlockProcessingResult::ParentUnknown { .. } => { + return Err(LookupRequestError::BadState( + "data processing returned ParentUnknown".to_owned(), + )); + } + BlockProcessingResult::Error { penalty, .. } => { + let peers = state.on_processing_failure()?; + if let Some((action, whom, msg)) = penalty { + whom.apply(action, &peers, msg, cx); + } + } + } + self.continue_requests(cx) + } + + /// Handle a block download response. Updates download state and advances the lookup. + pub fn on_block_download_response( + &mut self, + req_id: ReqId, + result: BlockDownloadResponse, + cx: &mut SyncNetworkContext, + ) -> Result { + self.block_request + .state + .on_download_response(req_id, result)?; + self.continue_requests(cx) + } + + /// Handle a custody columns download response. Updates download state and advances the lookup. + pub fn on_custody_download_response( + &mut self, + req_id: ReqId, + result: CustodyDownloadResponse, + cx: &mut SyncNetworkContext, + ) -> Result { + let DataRequest::Request { state, .. 
} = &mut self.data_request else { + return Err(LookupRequestError::BadState("no data_request".to_owned())); + }; + + state.on_download_response(req_id, result)?; + self.continue_requests(cx) } /// Get all unique peers that claim to have imported this set of block components @@ -340,7 +385,7 @@ impl SingleBlockLookup { } /// Add peer to all request states. The peer must be able to serve this request. - /// Returns true if the peer was newly inserted into some request state. + /// Returns true if the peer was newly inserted into any peer set. pub fn add_peer(&mut self, peer_id: PeerId) -> bool { self.peers.write().insert(peer_id) } @@ -356,52 +401,23 @@ impl SingleBlockLookup { } } -/// The state of the custody request component of a `SingleBlockLookup`. -#[derive(Educe)] -#[educe(Debug)] -pub struct CustodyRequestState { - #[educe(Debug(ignore))] - pub block_root: Hash256, - pub slot: Slot, - pub state: SingleLookupRequestState>, -} - -impl CustodyRequestState { - pub fn new(block_root: Hash256, slot: Slot) -> Self { - Self { - block_root, - slot, - state: SingleLookupRequestState::new(), - } - } -} - -/// The state of the block request component of a `SingleBlockLookup`. 
-#[derive(Educe)] -#[educe(Debug)] -pub struct BlockRequestState { - #[educe(Debug(ignore))] - pub requested_block_root: Hash256, - pub state: SingleLookupRequestState>>, -} - -impl BlockRequestState { - pub fn new(block_root: Hash256) -> Self { - Self { - requested_block_root: block_root, - state: SingleLookupRequestState::new(), - } - } -} - #[derive(Debug, Clone)] pub struct DownloadResult { pub value: T, - pub block_root: Hash256, pub seen_timestamp: Duration, pub peer_group: PeerGroup, } +impl DownloadResult { + pub fn new(value: T, peer_group: PeerGroup, seen_timestamp: Duration) -> Self { + Self { + value, + seen_timestamp, + peer_group, + } + } +} + #[derive(IntoStaticStr)] pub enum State { AwaitingDownload(/* reason */ &'static str), @@ -410,7 +426,7 @@ pub enum State { /// Request is processing, sent by lookup sync Processing(DownloadResult), /// Request is processed - Processed(/* reason */ &'static str), + Processed(/* reason */ &'static str, T), } /// Object representing the state of a single block or blob lookup request. @@ -477,10 +493,29 @@ impl SingleLookupRequestState { State::Downloading { .. } => None, State::AwaitingProcess(result) => Some(&result.value), State::Processing(result) => Some(&result.value), - State::Processed { .. } => None, + State::Processed(_, value) => Some(value), } } + /// Drive download: check max attempts, issue request, handle result. + fn maybe_start_downloading( + &mut self, + request_fn: impl FnOnce() -> Result, RpcRequestSendError>, + ) -> Result<(), LookupRequestError> { + if self.is_awaiting_download() { + match request_fn().map_err(LookupRequestError::SendFailedNetwork)? { + LookupRequestResult::RequestSent(req_id) => self.on_download_start(req_id)?, + LookupRequestResult::NoRequestNeeded(reason, value) => { + self.on_completed_request(reason, value)? 
+ } + LookupRequestResult::Pending(reason) => { + self.update_awaiting_download_status(reason) + } + } + } + Ok(()) + } + /// Switch to `AwaitingProcessing` if the request is in `AwaitingDownload` state, otherwise /// ignore. pub fn insert_verified_response(&mut self, result: DownloadResult) -> bool { @@ -513,6 +548,17 @@ impl SingleLookupRequestState { } } + pub fn on_download_response( + &mut self, + req_id: ReqId, + result: Result, RpcResponseError>, + ) -> Result<(), LookupRequestError> { + match result { + Ok(result) => self.on_download_success(req_id, result), + Err(_) => self.on_download_failure(req_id), + } + } + /// Registers a failure in downloading a block. This might be a peer disconnection or a wrong /// block. pub fn on_download_failure(&mut self, req_id: ReqId) -> Result<(), LookupRequestError> { @@ -525,6 +571,10 @@ impl SingleLookupRequestState { }); } self.failed_downloading = self.failed_downloading.saturating_add(1); + if self.failed_downloading >= SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS { + return Err(LookupRequestError::TooManyAttempts); + } + self.state = State::AwaitingDownload("not started"); Ok(()) } @@ -589,6 +639,9 @@ impl SingleLookupRequestState { State::Processing(result) => { let peers_source = result.peer_group.clone(); self.failed_processing = self.failed_processing.saturating_add(1); + if self.failed_processing >= SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS { + return Err(LookupRequestError::TooManyAttempts); + } self.state = State::AwaitingDownload("not started"); Ok(peers_source) } @@ -600,8 +653,8 @@ impl SingleLookupRequestState { pub fn on_processing_success(&mut self) -> Result<(), LookupRequestError> { match &self.state { - State::Processing(_) => { - self.state = State::Processed("processing success"); + State::Processing(data) => { + self.state = State::Processed("processing success", data.value.clone()); Ok(()) } other => Err(LookupRequestError::BadState(format!( @@ -611,10 +664,14 @@ impl SingleLookupRequestState { } /// Mark a request 
as complete without any download or processing - pub fn on_completed_request(&mut self, reason: &'static str) -> Result<(), LookupRequestError> { + pub fn on_completed_request( + &mut self, + reason: &'static str, + value: T, + ) -> Result<(), LookupRequestError> { match &self.state { State::AwaitingDownload { .. } => { - self.state = State::Processed(reason); + self.state = State::Processed(reason, value); Ok(()) } other => Err(LookupRequestError::BadState(format!( @@ -622,15 +679,6 @@ impl SingleLookupRequestState { ))), } } - - /// The total number of failures, whether it be processing or downloading. - pub fn failed_attempts(&self) -> u8 { - self.failed_processing + self.failed_downloading - } - - pub fn more_failed_processing_attempts(&self) -> bool { - self.failed_processing >= self.failed_downloading - } } // Display is used in the BadState assertions above @@ -647,15 +695,15 @@ impl std::fmt::Debug for State { match self { Self::AwaitingDownload(reason) => write!(f, "AwaitingDownload({})", reason), Self::Downloading(req_id) => write!(f, "Downloading({:?})", req_id), - Self::AwaitingProcess(d) => write!(f, "AwaitingProcess({:?})", d.peer_group), - Self::Processing(d) => write!(f, "Processing({:?})", d.peer_group), - Self::Processed(reason) => write!(f, "Processed({})", reason), + Self::AwaitingProcess(_) => write!(f, "AwaitingProcess"), + Self::Processing(_) => write!(f, "Processing"), + Self::Processed(reason, _) => write!(f, "Processed({})", reason), } } } fn fmt_peer_set_as_len( - peer_set: &Arc>>, + peer_set: &PeerSet, f: &mut std::fmt::Formatter, ) -> Result<(), std::fmt::Error> { write!(f, "{}", peer_set.read().len()) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index ecbe6227cc..166c65b6e1 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -45,9 +45,7 @@ use crate::network_beacon_processor::{ }; use crate::service::NetworkMessage; use 
crate::status::ToStatusMessage; -use crate::sync::block_lookups::{ - BlockComponent, BlockRequestState, CustodyRequestState, DownloadResult, -}; +use crate::sync::block_lookups::{BlockComponent, DownloadResult}; use crate::sync::custody_backfill_sync::CustodyBackFillSync; use crate::sync::network_context::{PeerGroup, RpcResponseResult}; use beacon_chain::block_verification_types::AsBlock; @@ -144,11 +142,9 @@ pub enum SyncMessage { /// A block with an unknown parent has been received. UnknownParentBlock(PeerId, Arc>, Hash256), - /// A data column with an unknown parent has been received. - UnknownParentDataColumn(PeerId, Arc>), - - /// A partial data column with an unknown parent has been received. - UnknownParentPartialDataColumn { + /// A sidecar (full/partial data column) with an unknown parent has been received. Carries only the header + /// info needed to trigger a parent lookup, decoupled from the concrete sidecar type. + UnknownParentSidecarHeader { peer_id: PeerId, block_root: Hash256, parent_root: Hash256, @@ -869,64 +865,24 @@ impl SyncManager { block_slot, BlockComponent::Block(DownloadResult { value: block.block_cloned(), - block_root, seen_timestamp: self.chain.slot_clock.now_duration().unwrap_or_default(), peer_group: PeerGroup::from_single(peer_id), }), ); } - SyncMessage::UnknownParentDataColumn(peer_id, data_column) => { - let data_column_slot = data_column.slot(); - let block_root = data_column.block_root(); - match data_column.as_ref() { - DataColumnSidecar::Fulu(column) => { - let parent_root = column.block_parent_root(); - debug!(%block_root, %parent_root, "Received unknown parent data column message"); - self.handle_unknown_parent( - peer_id, - block_root, - parent_root, - data_column_slot, - BlockComponent::DataColumn(DownloadResult { - value: parent_root, - block_root, - seen_timestamp: self - .chain - .slot_clock - .now_duration() - .unwrap_or_default(), - peer_group: PeerGroup::from_single(peer_id), - }), - ); - } - 
DataColumnSidecar::Gloas(_) => { - // TODO(gloas): proper lookup sync for Gloas. Routing into - // `handle_unknown_block_root` here mixes column processing with the - // single-block-lookup path; the Gloas column-arrives-before-block - // case wants its own queue/wakeup. - debug!(%block_root, "Received unknown block data column message"); - self.handle_unknown_block_root(peer_id, block_root); - } - } - } - SyncMessage::UnknownParentPartialDataColumn { + SyncMessage::UnknownParentSidecarHeader { peer_id, block_root, parent_root, slot, } => { - debug!(%block_root, %parent_root, "Received unknown parent partial column message"); + debug!(%block_root, %parent_root, "Received unknown parent sidecar header message"); self.handle_unknown_parent( peer_id, block_root, parent_root, slot, - BlockComponent::PartialDataColumn(DownloadResult { - value: parent_root, - block_root, - seen_timestamp: self.chain.slot_clock.now_duration().unwrap_or_default(), - peer_group: PeerGroup::from_single(peer_id), - }), + BlockComponent::Sidecar, ); } SyncMessage::UnknownBlockHashFromAttestation(peer_id, block_root) => { @@ -1016,6 +972,7 @@ impl SyncManager { if self.block_lookups.search_child_and_parent( block_root, block_component, + parent_root, peer_id, &mut self.network, ) { @@ -1166,14 +1123,13 @@ impl SyncManager { block: RpcEvent>>, ) { if let Some(resp) = self.network.on_single_block_response(id, peer_id, block) { - self.block_lookups - .on_download_response::>( - id, - resp.map(|(value, seen_timestamp)| { - (value, PeerGroup::from_single(peer_id), seen_timestamp) - }), - &mut self.network, - ) + self.block_lookups.on_block_download_response( + id, + resp.map(|(value, seen_timestamp)| { + DownloadResult::new(value, PeerGroup::from_single(peer_id), seen_timestamp) + }), + &mut self.network, + ) } } @@ -1349,11 +1305,7 @@ impl SyncManager { response: CustodyByRootResult, ) { self.block_lookups - .on_download_response::>( - requester.0, - response, - &mut self.network, - ); + 
.on_custody_download_response(requester.0, response, &mut self.network); } /// Handles receiving a response for a range sync request that should have both blocks and diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 95ae84755c..1e35c0a72f 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -16,7 +16,7 @@ use crate::network_beacon_processor::TestBeaconChainType; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::batch::ByRangeRequestType; -use crate::sync::block_lookups::SingleLookupId; +use crate::sync::block_lookups::{DownloadResult, SingleLookupId}; use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::range_data_column_batch_request::RangeDataColumnBatchRequest; use beacon_chain::block_verification_types::LookupBlock; @@ -95,7 +95,7 @@ pub type RpcResponseResult = Result<(T, Duration), RpcResponseError>; /// Duration = latest seen timestamp of all received data columns pub type CustodyByRootResult = - Result<(DataColumnSidecarList, PeerGroup, Duration), RpcResponseError>; + Result>, RpcResponseError>; #[derive(Debug)] pub enum RpcResponseError { @@ -176,13 +176,13 @@ impl PeerGroup { /// Sequential ID that uniquely identifies ReqResp outgoing requests pub type ReqId = u32; -pub enum LookupRequestResult { +pub enum LookupRequestResult { /// A request is sent. Sync MUST receive an event from the network in the future for either: /// completed response or failed request RequestSent(I), /// No request is sent, and no further action is necessary to consider this request completed. /// Includes a reason why this request is not needed. - NoRequestNeeded(&'static str), + NoRequestNeeded(&'static str, T), /// No request is sent, but the request is not completed. Sync MUST receive some future event /// that makes progress on the request. 
For example: request is processing from a different /// source (i.e. block received from gossip) and sync MUST receive an event with that processing @@ -820,7 +820,7 @@ impl SyncNetworkContext { lookup_id: SingleLookupId, lookup_peers: Arc>>, block_root: Hash256, - ) -> Result { + ) -> Result>>, RpcRequestSendError> { let active_request_count_by_peer = self.active_request_count_by_peer(); let Some(peer_id) = lookup_peers .read() @@ -871,9 +871,10 @@ impl SyncNetworkContext { }, // Block is fully validated. If it's not yet imported it's waiting for missing block // components. Consider this request completed and do nothing. - BlockProcessStatus::ExecutionValidated { .. } => { + BlockProcessStatus::ExecutionValidated(block) => { return Ok(LookupRequestResult::NoRequestNeeded( "block execution validated", + block, )); } } @@ -937,12 +938,13 @@ impl SyncNetworkContext { lookup_id: SingleLookupId, lookup_peers: Arc>>, block_root: Hash256, - ) -> Result { + ) -> Result, RpcRequestSendError> { // Skip the download if fork-choice already saw this envelope (e.g. imported via gossip // before the lookup got here). 
if self.chain.envelope_is_known_to_fork_choice(&block_root) { return Ok(LookupRequestResult::NoRequestNeeded( "envelope already known to fork-choice", + (), )); } @@ -1011,7 +1013,7 @@ impl SyncNetworkContext { peer_id: PeerId, request: DataColumnsByRootSingleBlockRequest, expect_max_responses: bool, - ) -> Result, &'static str> { + ) -> Result, &'static str> { let id = DataColumnsByRootRequestId { id: self.next_id(), requester, @@ -1060,7 +1062,7 @@ impl SyncNetworkContext { block_root: Hash256, block_slot: Slot, lookup_peers: Arc>>, - ) -> Result { + ) -> Result>, RpcRequestSendError> { let custody_indexes_imported = self .chain .cached_data_column_indexes(&block_root, block_slot) @@ -1078,7 +1080,10 @@ impl SyncNetworkContext { if custody_indexes_to_fetch.is_empty() { // No indexes required, do not issue any request - return Ok(LookupRequestResult::NoRequestNeeded("no indices to fetch")); + return Ok(LookupRequestResult::NoRequestNeeded( + "no indices to fetch", + vec![], + )); } let id = SingleLookupReqId { @@ -1528,8 +1533,8 @@ impl SyncNetworkContext { // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to // an Option first to use in an `if let Some() { act on result }` block. 
match result.as_ref() { - Some(Ok((columns, peer_group, _))) => { - debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing") + Some(Ok(data)) => { + debug!(?id, count = data.value.len(), peers = ?data.peer_group, "Custody request success, removing") } Some(Err(e)) => { debug!(?id, error = ?e, "Custody request failure, removing" ) diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index 2b96800e37..e74b74ec08 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -1,3 +1,4 @@ +use crate::sync::block_lookups::DownloadResult; use crate::sync::network_context::{ DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, }; @@ -56,8 +57,7 @@ struct ActiveBatchColumnsRequest { span: Span, } -pub type CustodyRequestResult = - Result, PeerGroup, Duration)>, Error>; +pub type CustodyRequestResult = Result>>, Error>; impl ActiveCustodyRequest { pub(crate) fn new( @@ -227,7 +227,11 @@ impl ActiveCustodyRequest { .into_iter() .max() .unwrap_or_else(|| cx.chain.slot_clock.now_duration().unwrap_or_default()); - return Ok(Some((columns, peer_group, max_seen_timestamp))); + return Ok(Some(DownloadResult::new( + columns, + peer_group, + max_seen_timestamp, + ))); } let active_request_count_by_peer = cx.active_request_count_by_peer(); @@ -343,7 +347,7 @@ impl ActiveCustodyRequest { }, ); } - LookupRequestResult::NoRequestNeeded(_) => unreachable!(), + LookupRequestResult::NoRequestNeeded(..) 
=> unreachable!(), LookupRequestResult::Pending(_) => unreachable!(), } } diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 1a60e4f243..3ec4d11da2 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -1365,7 +1365,18 @@ impl TestRig { peer_id: PeerId, data_column: Arc>, ) { - self.send_sync_message(SyncMessage::UnknownParentDataColumn(peer_id, data_column)); + let block_root = data_column.block_root(); + let slot = data_column.slot(); + let parent_root = match data_column.as_ref() { + DataColumnSidecar::Fulu(column) => column.block_parent_root(), + DataColumnSidecar::Gloas(_) => panic!("Gloas data column not supported in this test"), + }; + self.send_sync_message(SyncMessage::UnknownParentSidecarHeader { + peer_id, + block_root, + parent_root, + slot, + }); } fn trigger_unknown_block_from_attestation(&mut self, block_root: Hash256, peer_id: PeerId) { diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 647b5858cb..988e2d1fc5 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -387,6 +387,14 @@ pub fn cli_app() -> Command { .help("Disables the quic transport. The node will rely solely on the TCP transport for libp2p connections.") .display_order(0) ) + .arg( + Arg::new("enable-mplex") + .long("enable-mplex") + .action(ArgAction::SetTrue) + .help_heading(FLAG_HEADER) + .help("Enables mplex multiplexer alongside yamux. 
Yamux is preferred when both are available.") + .display_order(0) + ) .arg( Arg::new("disable-peer-scoring") .long("disable-peer-scoring") diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index 045b432dc9..ddf8d07c4e 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -1443,6 +1443,10 @@ pub fn set_network_config( config.disable_quic_support = true; } + if parse_flag(cli_args, "enable-mplex") { + config.enable_mplex = true; + } + if parse_flag(cli_args, "disable-upnp") { config.upnp_enabled = false; } diff --git a/book/src/help_bn.md b/book/src/help_bn.md index 30163f1f0c..1f57db1b59 100644 --- a/book/src/help_bn.md +++ b/book/src/help_bn.md @@ -494,6 +494,9 @@ Flags: Sets the local ENR IP address and port to match those set for lighthouse. Specifically, the IP address will be the value of --listen-address and the UDP port will be --discovery-port. + --enable-mplex + Enables mplex multiplexer alongside yamux. Yamux is preferred when + both are available. --enable-partial-columns Enable partial messages for data columns. This can reduce the amount of data sent over the network. Enabled by default on Hoodi and