De-prioritize failed peers in block and payload lookups (#9542)

Range/backfill sync already de-prioritizes peers that failed a batch download (batch.failed_peers() -> peers_to_deprioritize). Block and payload lookups did not: SingleLookupRequestState only tracked a failure counter, not which peer failed, so a retry could re-select the same flaky peer.


  Track failed_peers per request-state, record the peer on download failure, and feed it into the block_lookup_request / payload_lookup_request peer-selection sort key, mirroring range sync. Custody lookups already have their own per-peer-attempt mechanism and are left unchanged.


Co-Authored-By: dapplion <35266934+dapplion@users.noreply.github.com>
This commit is contained in:
Lion - dapplion
2026-06-26 21:36:56 +02:00
committed by GitHub
parent 812913643b
commit 5e9e86d0c3
4 changed files with 48 additions and 12 deletions

View File

@@ -433,6 +433,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
pub fn on_block_download_response( pub fn on_block_download_response(
&mut self, &mut self,
id: SingleLookupReqId, id: SingleLookupReqId,
peer_id: PeerId,
response: BlockDownloadResponse<T::EthSpec>, response: BlockDownloadResponse<T::EthSpec>,
cx: &mut SyncNetworkContext<T>, cx: &mut SyncNetworkContext<T>,
) { ) {
@@ -440,7 +441,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
debug!(?id, "Block returned for single block lookup not present"); debug!(?id, "Block returned for single block lookup not present");
return; return;
}; };
let result = lookup.on_block_download_response(id.req_id, response, cx); let result = lookup.on_block_download_response(id.req_id, peer_id, response, cx);
self.on_lookup_result(id.lookup_id, result, "block_download_response", cx); self.on_lookup_result(id.lookup_id, result, "block_download_response", cx);
} }
@@ -461,6 +462,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
pub fn on_payload_download_response( pub fn on_payload_download_response(
&mut self, &mut self,
id: SingleLookupReqId, id: SingleLookupReqId,
peer_id: PeerId,
response: PayloadDownloadResponse<T::EthSpec>, response: PayloadDownloadResponse<T::EthSpec>,
cx: &mut SyncNetworkContext<T>, cx: &mut SyncNetworkContext<T>,
) { ) {
@@ -471,7 +473,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
); );
return; return;
}; };
let result = lookup.on_payload_download_response(id.req_id, response, cx); let result = lookup.on_payload_download_response(id.req_id, peer_id, response, cx);
self.on_lookup_result(id.lookup_id, result, "payload_download_response", cx); self.on_lookup_result(id.lookup_id, result, "payload_download_response", cx);
} }

View File

@@ -379,9 +379,11 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
let _guard = self.span.clone().entered(); let _guard = self.span.clone().entered();
// === Block request === // === Block request ===
self.block_request.state.maybe_start_downloading(|| { self.block_request
cx.block_lookup_request(self.id, self.peers.clone(), self.block_root) .state
})?; .maybe_start_downloading(|failed_peers| {
cx.block_lookup_request(self.id, self.peers.clone(), failed_peers, self.block_root)
})?;
if self.awaiting_parent.is_none() if self.awaiting_parent.is_none()
&& let Some(data) = self.block_request.state.maybe_start_processing() && let Some(data) = self.block_request.state.maybe_start_processing()
{ {
@@ -412,7 +414,8 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
} }
} }
DataRequest::Request { slot, peers, state } => { DataRequest::Request { slot, peers, state } => {
state.maybe_start_downloading(|| { // Custody selects/de-prioritizes peers internally in `ActiveCustodyRequest`.
state.maybe_start_downloading(|_| {
let req_id = cx.next_id(); let req_id = cx.next_id();
cx.custody_lookup_request( cx.custody_lookup_request(
CustodyRequester::SingleLookup(SingleLookupReqId { CustodyRequester::SingleLookup(SingleLookupReqId {
@@ -464,8 +467,13 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
} }
} }
PayloadRequest::Request { peers, state } => { PayloadRequest::Request { peers, state } => {
state.maybe_start_downloading(|| { state.maybe_start_downloading(|failed_peers| {
cx.payload_lookup_request(self.id, peers.clone(), self.block_root) cx.payload_lookup_request(
self.id,
peers.clone(),
failed_peers,
self.block_root,
)
})?; })?;
// The envelope can only be verified once the block itself is imported; // The envelope can only be verified once the block itself is imported;
// otherwise processing returns `BlockRootUnknown` and the lookup burns retries // otherwise processing returns `BlockRootUnknown` and the lookup burns retries
@@ -621,9 +629,13 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
pub fn on_block_download_response( pub fn on_block_download_response(
&mut self, &mut self,
req_id: ReqId, req_id: ReqId,
peer_id: PeerId,
result: BlockDownloadResponse<T::EthSpec>, result: BlockDownloadResponse<T::EthSpec>,
cx: &mut SyncNetworkContext<T>, cx: &mut SyncNetworkContext<T>,
) -> Result<LookupResult, LookupRequestError> { ) -> Result<LookupResult, LookupRequestError> {
if result.is_err() {
self.block_request.state.record_failed_peer(peer_id);
}
self.block_request self.block_request
.state .state
.on_download_response(req_id, result)?; .on_download_response(req_id, result)?;
@@ -641,6 +653,7 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
return Err(LookupRequestError::BadState("no data_request".to_owned())); return Err(LookupRequestError::BadState("no data_request".to_owned()));
}; };
// Custody requests track and de-prioritize failed peers internally in `ActiveCustodyRequest`.
state.on_download_response(req_id, result)?; state.on_download_response(req_id, result)?;
self.continue_requests(cx) self.continue_requests(cx)
} }
@@ -649,6 +662,7 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
pub fn on_payload_download_response( pub fn on_payload_download_response(
&mut self, &mut self,
req_id: ReqId, req_id: ReqId,
peer_id: PeerId,
result: PayloadDownloadResponse<T::EthSpec>, result: PayloadDownloadResponse<T::EthSpec>,
cx: &mut SyncNetworkContext<T>, cx: &mut SyncNetworkContext<T>,
) -> Result<LookupResult, LookupRequestError> { ) -> Result<LookupResult, LookupRequestError> {
@@ -658,6 +672,9 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
)); ));
}; };
if result.is_err() {
state.record_failed_peer(peer_id);
}
state.on_download_response(req_id, result)?; state.on_download_response(req_id, result)?;
self.continue_requests(cx) self.continue_requests(cx)
} }
@@ -752,6 +769,9 @@ pub struct SingleLookupRequestState<T: Clone> {
failed_processing: u8, failed_processing: u8,
/// How many times have we attempted to download this block or blob. /// How many times have we attempted to download this block or blob.
failed_downloading: u8, failed_downloading: u8,
/// Peers that have failed to serve this request. Used to de-prioritize them when selecting a
/// peer to retry the download from.
failed_peers: HashSet<PeerId>,
} }
impl<T: Clone> SingleLookupRequestState<T> { impl<T: Clone> SingleLookupRequestState<T> {
@@ -760,6 +780,7 @@ impl<T: Clone> SingleLookupRequestState<T> {
state: State::AwaitingDownload("not started"), state: State::AwaitingDownload("not started"),
failed_processing: 0, failed_processing: 0,
failed_downloading: 0, failed_downloading: 0,
failed_peers: HashSet::new(),
} }
} }
@@ -814,10 +835,10 @@ impl<T: Clone> SingleLookupRequestState<T> {
/// Drive download: check max attempts, issue request, handle result. /// Drive download: check max attempts, issue request, handle result.
fn maybe_start_downloading( fn maybe_start_downloading(
&mut self, &mut self,
request_fn: impl FnOnce() -> Result<LookupRequestResult<T>, RpcRequestSendError>, request_fn: impl FnOnce(&HashSet<PeerId>) -> Result<LookupRequestResult<T>, RpcRequestSendError>,
) -> Result<(), LookupRequestError> { ) -> Result<(), LookupRequestError> {
if self.is_awaiting_download() { if self.is_awaiting_download() {
match request_fn().map_err(LookupRequestError::SendFailedNetwork)? { match request_fn(&self.failed_peers).map_err(LookupRequestError::SendFailedNetwork)? {
LookupRequestResult::RequestSent(req_id) => self.on_download_start(req_id)?, LookupRequestResult::RequestSent(req_id) => self.on_download_start(req_id)?,
LookupRequestResult::NoRequestNeeded(reason, value) => { LookupRequestResult::NoRequestNeeded(reason, value) => {
self.on_completed_request(reason, value)? self.on_completed_request(reason, value)?
@@ -862,6 +883,11 @@ impl<T: Clone> SingleLookupRequestState<T> {
} }
} }
/// Record a peer that failed to serve this request, to be de-prioritized on retry.
fn record_failed_peer(&mut self, peer_id: PeerId) {
self.failed_peers.insert(peer_id);
}
pub fn on_download_response( pub fn on_download_response(
&mut self, &mut self,
req_id: ReqId, req_id: ReqId,

View File

@@ -1129,6 +1129,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
if let Some(resp) = self.network.on_single_block_response(id, peer_id, block) { if let Some(resp) = self.network.on_single_block_response(id, peer_id, block) {
self.block_lookups.on_block_download_response( self.block_lookups.on_block_download_response(
id, id,
peer_id,
resp.map(|value| DownloadResult::new(value, PeerGroup::from_single(peer_id))), resp.map(|value| DownloadResult::new(value, PeerGroup::from_single(peer_id))),
&mut self.network, &mut self.network,
) )
@@ -1212,6 +1213,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
{ {
self.block_lookups.on_payload_download_response( self.block_lookups.on_payload_download_response(
id, id,
peer_id,
resp.map(|value| DownloadResult::new(value, PeerGroup::from_single(peer_id))), resp.map(|value| DownloadResult::new(value, PeerGroup::from_single(peer_id))),
&mut self.network, &mut self.network,
) )

View File

@@ -712,6 +712,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
&mut self, &mut self,
lookup_id: SingleLookupId, lookup_id: SingleLookupId,
lookup_peers: Arc<RwLock<HashSet<PeerId>>>, lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
peers_to_deprioritize: &HashSet<PeerId>,
block_root: Hash256, block_root: Hash256,
) -> Result<LookupRequestResult<Arc<SignedBeaconBlock<T::EthSpec>>>, RpcRequestSendError> { ) -> Result<LookupRequestResult<Arc<SignedBeaconBlock<T::EthSpec>>>, RpcRequestSendError> {
let blocks_by_root_per_peer = ActiveRequestsPerPeer::new(&self.blocks_by_root_requests); let blocks_by_root_per_peer = ActiveRequestsPerPeer::new(&self.blocks_by_root_requests);
@@ -720,6 +721,8 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
.iter() .iter()
.map(|peer| { .map(|peer| {
( (
// De-prioritize peers that have already failed this request
peers_to_deprioritize.contains(peer),
// Strictly de-prioritize peers already at the per-protocol concurrency limit // Strictly de-prioritize peers already at the per-protocol concurrency limit
blocks_by_root_per_peer.at_concurrency_limit(peer), blocks_by_root_per_peer.at_concurrency_limit(peer),
// Random factor to break ties, otherwise the PeerID breaks ties // Random factor to break ties, otherwise the PeerID breaks ties
@@ -728,7 +731,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
) )
}) })
.min() .min()
.map(|(_, _, peer)| *peer) .map(|(_, _, _, peer)| *peer)
else { else {
// Allow lookup to not have any peers and do nothing. This is an optimization to not // Allow lookup to not have any peers and do nothing. This is an optimization to not
// lose progress of lookups created from a block with unknown parent before we receive // lose progress of lookups created from a block with unknown parent before we receive
@@ -818,6 +821,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
&mut self, &mut self,
lookup_id: SingleLookupId, lookup_id: SingleLookupId,
lookup_peers: Arc<RwLock<HashSet<PeerId>>>, lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
peers_to_deprioritize: &HashSet<PeerId>,
block_root: Hash256, block_root: Hash256,
) -> Result< ) -> Result<
LookupRequestResult<Arc<SignedExecutionPayloadEnvelope<T::EthSpec>>>, LookupRequestResult<Arc<SignedExecutionPayloadEnvelope<T::EthSpec>>>,
@@ -841,6 +845,8 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
.iter() .iter()
.map(|peer| { .map(|peer| {
( (
// De-prioritize peers that have already failed this request
peers_to_deprioritize.contains(peer),
// Strictly de-prioritize peers already at the per-protocol concurrency limit // Strictly de-prioritize peers already at the per-protocol concurrency limit
payload_envelopes_by_root_per_peer.at_concurrency_limit(peer), payload_envelopes_by_root_per_peer.at_concurrency_limit(peer),
rand::random::<u32>(), rand::random::<u32>(),
@@ -848,7 +854,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
) )
}) })
.min() .min()
.map(|(_, _, peer)| *peer) .map(|(_, _, _, peer)| *peer)
else { else {
return Ok(LookupRequestResult::Pending("no peers")); return Ok(LookupRequestResult::Pending("no peers"));
}; };