From 01e4e3527e33e6b2c99b3cbf43ceda263b538862 Mon Sep 17 00:00:00 2001 From: Lion - dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 1 May 2024 23:45:36 +0900 Subject: [PATCH] Check da_checker before doing a block lookup request (#5681) * Check da_checker before doing a block lookup request * Ensure consistent handling of lookup result * use req resp pre import cache rather than da checker --- beacon_node/network/src/metrics.rs | 8 ++ .../network/src/sync/block_lookups/common.rs | 25 ++-- .../network/src/sync/block_lookups/mod.rs | 117 ++++++++++-------- .../sync/block_lookups/single_block_lookup.rs | 65 +++++++--- .../network/src/sync/network_context.rs | 21 +++- 5 files changed, 151 insertions(+), 85 deletions(-) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 8df22a3d00..d096b33f6c 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -244,11 +244,19 @@ lazy_static! { "sync_parent_block_lookups", "Number of parent block lookups underway" ); + pub static ref SYNC_LOOKUP_CREATED: Result = try_create_int_counter( + "sync_lookups_created_total", + "Total count of sync lookups created", + ); pub static ref SYNC_LOOKUP_DROPPED: Result = try_create_int_counter_vec( "sync_lookups_dropped_total", "Total count of sync lookups dropped by reason", &["reason"] ); + pub static ref SYNC_LOOKUP_COMPLETED: Result = try_create_int_counter( + "sync_lookups_completed_total", + "Total count of sync lookups completed", + ); /* * Block Delay Metrics diff --git a/beacon_node/network/src/sync/block_lookups/common.rs b/beacon_node/network/src/sync/block_lookups/common.rs index dc82000ef1..06e00ea6d1 100644 --- a/beacon_node/network/src/sync/block_lookups/common.rs +++ b/beacon_node/network/src/sync/block_lookups/common.rs @@ -5,9 +5,7 @@ use crate::sync::block_lookups::{ BlobRequestState, BlockRequestState, PeerId, SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS, }; use crate::sync::manager::{BlockProcessType, Id, SLOT_IMPORT_TOLERANCE}; -use crate::sync::network_context::{ - BlobsByRootSingleBlockRequest, BlocksByRootSingleRequest, SyncNetworkContext, -}; +use crate::sync::network_context::SyncNetworkContext; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; use std::sync::Arc; @@ -42,9 +40,6 @@ pub(crate) struct BlockIsProcessed(pub bool); /// safety when handling a block/blob response ensuring we only mutate the correct corresponding /// state. pub trait RequestState { - /// The type of the request . - type RequestType; - /// The type created after validation. type VerifiedResponseType: Clone; @@ -71,9 +66,11 @@ pub trait RequestState { .use_rand_available_peer() .ok_or(LookupRequestError::NoPeers)?; - // make_request returns true only if a request was made + // make_request returns true only if a request needs to be made if self.make_request(id, peer_id, downloaded_block_expected_blobs, cx)? { self.get_state_mut().on_download_start()?; + } else { + self.get_state_mut().on_completed_request()?; } // Otherwise, attempt to progress awaiting processing @@ -92,7 +89,9 @@ pub trait RequestState { Ok(()) } - /// Send the request to the network service. + /// Request the network context to prepare a request of a component of `block_root`. If the + /// request is not necessary because the component is already known / processed, return false. + /// Return true if it sent a request and we can expect an event back from the network. fn make_request( &self, id: Id, @@ -126,7 +125,6 @@ pub trait RequestState { } impl RequestState for BlockRequestState { - type RequestType = BlocksByRootSingleRequest; type VerifiedResponseType = Arc>; fn make_request( @@ -136,12 +134,8 @@ impl RequestState for BlockRequestState { _: Option, cx: &mut SyncNetworkContext, ) -> Result { - cx.block_lookup_request( - id, - peer_id, - BlocksByRootSingleRequest(self.requested_block_root), - ) - .map_err(LookupRequestError::SendFailed) + cx.block_lookup_request(id, peer_id, self.requested_block_root) + .map_err(LookupRequestError::SendFailed) } fn send_for_processing( @@ -179,7 +173,6 @@ impl RequestState for BlockRequestState { } impl RequestState for BlobRequestState { - type RequestType = BlobsByRootSingleBlockRequest; type VerifiedResponseType = FixedBlobSidecarList; fn make_request( diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index a0c7c33bb0..6852761d8b 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -1,6 +1,6 @@ use self::parent_chain::{compute_parent_chains, NodeChain}; pub use self::single_block_lookup::DownloadResult; -use self::single_block_lookup::{LookupRequestError, SingleBlockLookup}; +use self::single_block_lookup::{LookupRequestError, LookupResult, SingleBlockLookup}; use super::manager::{BlockProcessType, BlockProcessingResult}; use super::network_context::{RpcProcessingResult, SyncNetworkContext}; use crate::metrics; @@ -17,6 +17,7 @@ use lighthouse_network::{PeerAction, PeerId}; use lru_cache::LRUTimeCache; pub use single_block_lookup::{BlobRequestState, BlockRequestState}; use slog::{debug, error, trace, warn, Logger}; +use std::collections::hash_map::Entry; use std::sync::Arc; use std::time::Duration; use store::Hash256; @@ -266,6 +267,7 @@ impl BlockLookups { "peer_ids" => ?peers, "block" => ?block_root, ); + metrics::inc_counter(&metrics::SYNC_LOOKUP_CREATED); // If we know that this lookup has unknown parent (is awaiting a parent lookup to resolve), // signal here to hold processing downloaded data. @@ -276,14 +278,20 @@ impl BlockLookups { lookup.add_child_components(block_component); } - if let Err(e) = lookup.continue_requests(cx) { - self.on_lookup_request_error(lookup.id, e, "new_current_lookup"); - false - } else { - self.single_block_lookups.insert(lookup.id, lookup); - self.update_metrics(); - true - } + let id = lookup.id; + let lookup = match self.single_block_lookups.entry(id) { + Entry::Vacant(entry) => entry.insert(lookup), + Entry::Occupied(_) => { + // Should never happen + warn!(self.log, "Lookup exists with same id"; "id" => id); + return false; + } + }; + + let result = lookup.continue_requests(cx); + self.on_lookup_result(id, result, "new_current_lookup", cx); + self.update_metrics(); + true } /* Lookup responses */ @@ -296,9 +304,8 @@ impl BlockLookups { response: RpcProcessingResult, cx: &mut SyncNetworkContext, ) { - if let Err(e) = self.on_download_response_inner::(id, peer_id, response, cx) { - self.on_lookup_request_error(id, e, "download_response"); - } + let result = self.on_download_response_inner::(id, peer_id, response, cx); + self.on_lookup_result(id, result, "download_response", cx); } /// Process a block or blob response received from a single lookup request. @@ -308,7 +315,7 @@ impl BlockLookups { peer_id: PeerId, response: RpcProcessingResult, cx: &mut SyncNetworkContext, - ) -> Result<(), LookupRequestError> { + ) -> Result { // Downscore peer even if lookup is not known // Only downscore lookup verify errors. RPC errors are downscored in the network handler. if let Err(LookupFailure::LookupVerifyError(e)) = &response { @@ -321,7 +328,7 @@ impl BlockLookups { // We don't have the ability to cancel in-flight RPC requests. So this can happen // if we started this RPC request, and later saw the block/blobs via gossip. debug!(self.log, "Block returned for single block lookup not present"; "id" => id); - return Ok(()); + return Err(LookupRequestError::UnknownLookup); }; let block_root = lookup.block_root(); @@ -346,7 +353,6 @@ impl BlockLookups { peer_id, })?; // continue_request will send for processing as the request state is AwaitingProcessing - lookup.continue_request::(cx) } Err(e) => { debug!(self.log, @@ -359,9 +365,10 @@ impl BlockLookups { request_state.on_download_failure()?; // continue_request will retry a download as the request state is AwaitingDownload - lookup.continue_request::(cx) } } + + lookup.continue_requests(cx) } /* Error responses */ @@ -388,16 +395,15 @@ impl BlockLookups { result: BlockProcessingResult, cx: &mut SyncNetworkContext, ) { - if let Err(e) = match process_type { + let lookup_result = match process_type { BlockProcessType::SingleBlock { id } => { self.on_processing_result_inner::>(id, result, cx) } BlockProcessType::SingleBlob { id } => { self.on_processing_result_inner::>(id, result, cx) } - } { - self.on_lookup_request_error(process_type.id(), e, "processing_result"); - } + }; + self.on_lookup_result(process_type.id(), lookup_result, "processing_result", cx); } pub fn on_processing_result_inner>( @@ -405,10 +411,10 @@ impl BlockLookups { lookup_id: SingleLookupId, result: BlockProcessingResult, cx: &mut SyncNetworkContext, - ) -> Result<(), LookupRequestError> { + ) -> Result { let Some(lookup) = self.single_block_lookups.get_mut(&lookup_id) else { debug!(self.log, "Unknown single block lookup"; "id" => lookup_id); - return Ok(()); + return Err(LookupRequestError::UnknownLookup); }; let block_root = lookup.block_root(); @@ -442,15 +448,17 @@ impl BlockLookups { // wrong. If we already had both a block and blobs response processed, we should penalize the // blobs peer because they did not provide all blobs on the initial request. if lookup.both_components_processed() { - let blob_peer = lookup + if let Some(blob_peer) = lookup .blob_request_state .state - .on_post_process_validation_failure()?; - cx.report_peer( - blob_peer, - PeerAction::MidToleranceError, - "sent_incomplete_blobs", - ); + .on_post_process_validation_failure()? + { + cx.report_peer( + blob_peer, + PeerAction::MidToleranceError, + "sent_incomplete_blobs", + ); + } } Action::Retry } @@ -527,47 +535,41 @@ impl BlockLookups { Action::Retry => { // Trigger download for all components in case `MissingComponents` failed the blob // request. Also if blobs are `AwaitingProcessing` and need to be progressed - lookup.continue_requests(cx)?; + lookup.continue_requests(cx) } Action::ParentUnknown { parent_root } => { let peers = lookup.all_available_peers().cloned().collect::>(); lookup.set_awaiting_parent(parent_root); debug!(self.log, "Marking lookup as awaiting parent"; "lookup" => %block_root, "parent_root" => %parent_root); self.search_parent_of_child(parent_root, block_root, &peers, cx); + Ok(LookupResult::Pending) } Action::Drop => { // Drop with noop - self.drop_lookup_and_children(lookup_id); - self.update_metrics(); + Err(LookupRequestError::Failed) } Action::Continue => { // Drop this completed lookup only - self.single_block_lookups.remove(&lookup_id); - self.update_metrics(); - debug!(self.log, "Dropping completed lookup"; "block" => %block_root); - // Block imported, continue the requests of pending child blocks - self.continue_child_lookups(block_root, cx); + Ok(LookupResult::Completed) } } - Ok(()) } /// Makes progress on the immediate children of `block_root` pub fn continue_child_lookups(&mut self, block_root: Hash256, cx: &mut SyncNetworkContext) { - let mut failed_lookups = vec![]; // < need to clean failed lookups latter to re-borrow &mut self + let mut lookup_results = vec![]; // < need to buffer lookup results to not re-borrow &mut self for (id, lookup) in self.single_block_lookups.iter_mut() { if lookup.awaiting_parent() == Some(block_root) { lookup.resolve_awaiting_parent(); debug!(self.log, "Continuing child lookup"; "parent_root" => %block_root, "block_root" => %lookup.block_root()); - if let Err(e) = lookup.continue_requests(cx) { - failed_lookups.push((*id, e)); - } + let result = lookup.continue_requests(cx); + lookup_results.push((*id, result)); } } - for (id, e) in failed_lookups { - self.on_lookup_request_error(id, e, "continue_child_lookups"); + for (id, result) in lookup_results { + self.on_lookup_result(id, result, "continue_child_lookups", cx); } } @@ -592,16 +594,31 @@ impl BlockLookups { } /// Common handler a lookup request error, drop it and update metrics - fn on_lookup_request_error( + fn on_lookup_result( &mut self, id: SingleLookupId, - error: LookupRequestError, + result: Result, source: &str, + cx: &mut SyncNetworkContext, ) { - debug!(self.log, "Dropping lookup on request error"; "id" => id, "source" => source, "error" => ?error); - metrics::inc_counter_vec(&metrics::SYNC_LOOKUP_DROPPED, &[error.as_metric()]); - self.drop_lookup_and_children(id); - self.update_metrics(); + match result { + Ok(LookupResult::Pending) => {} // no action + Ok(LookupResult::Completed) => { + if let Some(lookup) = self.single_block_lookups.remove(&id) { + debug!(self.log, "Dropping completed lookup"; "block" => %lookup.block_root()); + metrics::inc_counter(&metrics::SYNC_LOOKUP_COMPLETED); + // Block imported, continue the requests of pending child blocks + self.continue_child_lookups(lookup.block_root(), cx); + self.update_metrics(); + } + } + Err(error) => { + debug!(self.log, "Dropping lookup on request error"; "id" => id, "source" => source, "error" => ?error); + metrics::inc_counter_vec(&metrics::SYNC_LOOKUP_DROPPED, &[error.into()]); + self.drop_lookup_and_children(id); + self.update_metrics(); + } + } } /* Helper functions */ diff --git a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs index 76deb23674..b642ec8e5b 100644 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs @@ -15,6 +15,15 @@ use strum::IntoStaticStr; use types::blob_sidecar::FixedBlobSidecarList; use types::{EthSpec, SignedBeaconBlock}; +// Dedicated enum for LookupResult to force its usage +#[must_use = "LookupResult must be handled with on_lookup_result"] +pub enum LookupResult { + /// Lookup completed successfully + Completed, + /// Lookup is expecting some future event from the network + Pending, +} + #[derive(Debug, PartialEq, Eq, IntoStaticStr)] pub enum LookupRequestError { /// Too many failed attempts @@ -22,9 +31,16 @@ pub enum LookupRequestError { /// The failed attempts were primarily due to processing failures. cannot_process: bool, }, + /// No peers left to serve this lookup NoPeers, + /// Error sending event to network or beacon processor SendFailed(&'static str), + /// Inconsistent lookup request state BadState(String), + /// Lookup failed for some other reason and should be dropped + Failed, + /// Attempted to retrieve a not known lookup id + UnknownLookup, } pub struct SingleBlockLookup { @@ -112,16 +128,29 @@ impl SingleBlockLookup { .unique() } + /// Makes progress on all requests of this lookup. Any error is not recoverable and must result + /// in dropping the lookup. May mark the lookup as completed. pub fn continue_requests( &mut self, cx: &mut SyncNetworkContext, - ) -> Result<(), LookupRequestError> { + ) -> Result { // TODO: Check what's necessary to download, specially for blobs self.continue_request::>(cx)?; self.continue_request::>(cx)?; - Ok(()) + + // If all components of this lookup are already processed, there will be no future events + // that can make progress so it must be dropped. Consider the lookup completed. + // This case can happen if we receive the components from gossip during a retry. + if self.block_request_state.state.is_processed() + && self.blob_request_state.state.is_processed() + { + Ok(LookupResult::Completed) + } else { + Ok(LookupResult::Pending) + } } + /// Wrapper around `RequestState::continue_request` to inject lookup data pub fn continue_request>( &mut self, cx: &mut SyncNetworkContext, @@ -224,7 +253,7 @@ pub enum State { Downloading, AwaitingProcess(DownloadResult), Processing(DownloadResult), - Processed(PeerId), + Processed(Option), } /// Object representing the state of a single block or blob lookup request. @@ -388,7 +417,7 @@ impl SingleLookupRequestState { match &self.state { State::Processing(result) => { let peer_id = result.peer_id; - self.state = State::Processed(peer_id); + self.state = State::Processed(Some(peer_id)); Ok(peer_id) } other => Err(LookupRequestError::BadState(format!( @@ -397,7 +426,9 @@ impl SingleLookupRequestState { } } - pub fn on_post_process_validation_failure(&mut self) -> Result { + pub fn on_post_process_validation_failure( + &mut self, + ) -> Result, LookupRequestError> { match &self.state { State::Processed(peer_id) => { let peer_id = *peer_id; @@ -411,6 +442,19 @@ impl SingleLookupRequestState { } } + /// Mark a request as complete without any download or processing + pub fn on_completed_request(&mut self) -> Result<(), LookupRequestError> { + match &self.state { + State::AwaitingDownload => { + self.state = State::Processed(None); + Ok(()) + } + other => Err(LookupRequestError::BadState(format!( + "Bad state on_completed_request expected AwaitingDownload got {other}" + ))), + } + } + /// The total number of failures, whether it be processing or downloading. pub fn failed_attempts(&self) -> u8 { self.failed_processing + self.failed_downloading @@ -461,14 +505,3 @@ impl std::fmt::Display for State { } } } - -impl LookupRequestError { - pub(crate) fn as_metric(&self) -> &'static str { - match self { - LookupRequestError::TooManyAttempts { .. } => "TooManyAttempts", - LookupRequestError::NoPeers => "NoPeers", - LookupRequestError::SendFailed { .. } => "SendFailed", - LookupRequestError::BadState { .. } => "BadState", - } - } -} diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 860192db68..88495a5b35 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -296,12 +296,25 @@ impl SyncNetworkContext { } } + /// Request block of `block_root` if necessary by checking: + /// - If the da_checker has a pending block from gossip or a previous request + /// + /// Returns false if no request was made, because the block is already imported pub fn block_lookup_request( &mut self, lookup_id: SingleLookupId, peer_id: PeerId, - request: BlocksByRootSingleRequest, + block_root: Hash256, ) -> Result { + if self + .chain + .reqresp_pre_import_cache + .read() + .contains_key(&block_root) + { + return Ok(false); + } + let id = SingleLookupReqId { lookup_id, req_id: self.next_id(), @@ -311,11 +324,13 @@ impl SyncNetworkContext { self.log, "Sending BlocksByRoot Request"; "method" => "BlocksByRoot", - "block_root" => ?request.0, + "block_root" => ?block_root, "peer" => %peer_id, "id" => ?id ); + let request = BlocksByRootSingleRequest(block_root); + self.send_network_msg(NetworkMessage::SendRequest { peer_id, request: Request::BlocksByRoot(request.into_request(&self.chain.spec)), @@ -333,7 +348,7 @@ impl SyncNetworkContext { /// - If the da_checker has a pending block /// - If the da_checker has pending blobs from gossip /// - /// Returns false if no request was made, because we don't need to fetch (more) blobs. + /// Returns false if no request was made, because we don't need to import (more) blobs. pub fn blob_lookup_request( &mut self, lookup_id: SingleLookupId,