mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-20 13:24:44 +00:00
Remove all batches related to a peer on disconnect (#5969)
* Remove all batches related to a peer on disconnect * Cleanup map entries after disconnect * Allow lookups to continue in case of disconnections * Pretty response types * fmt * Fix lints * Remove lookup if it cannot progress * Fix tests * Remove poll_close on rpc behaviour * Remove redundant test * Fix issue raised by lion * Revert pretty response types * Cleanup * Fix test * Merge remote-tracking branch 'origin/release-v5.2.1' into rpc-error-on-disconnect-revert * Apply suggestions from joao Co-authored-by: João Oliveira <hello@jxs.pt> * Fix log * update request status on no peers found * Do not remove lookup after peer disconnection * Add comments about expected event api * Update single_block_lookup.rs * Update mod.rs * Merge branch 'rpc-error-on-disconnect-revert' into 5969-review * Merge pull request #10 from dapplion/5969-review Add comments about expected event api
This commit is contained in:
@@ -1,3 +1,25 @@
|
||||
//! Implements block lookup sync.
|
||||
//!
|
||||
//! Block lookup sync is triggered when a peer claims to have imported a block we don't know about.
|
||||
//! For example, a peer attesting to a head block root that is not in our fork-choice. Lookup sync
|
||||
//! is recursive in nature, as we may discover that this attested head block root has a parent that
|
||||
//! is also unknown to us.
|
||||
//!
|
||||
//! Block lookup is implemented as an event-driven state machine. It sends events to the network and
|
||||
//! beacon processor, and expects some set of events back. A discrepancy in the expected event API
|
||||
//! will result in lookups getting "stuck". A lookup becomes stuck when there is no future event
|
||||
//! that will trigger the lookup to make progress. There's a fallback mechanism that drops lookups
|
||||
//! that live for too long, logging the line "Notify the devs a sync lookup is stuck".
|
||||
//!
|
||||
//! The expected event API is documented in the code paths that are making assumptions with the
|
||||
//! comment prefix "Lookup sync event safety:"
|
||||
//!
|
||||
//! Block lookup sync attempts to not re-download or re-process data that we already have. Block
|
||||
//! components are cached temporarily in multiple places before they are imported into fork-choice.
|
||||
//! Therefore, block lookup sync must peek these caches correctly to decide when to skip a download
|
||||
//! or consider a lookup complete. These caches are read from the `SyncNetworkContext` and its state
|
||||
//! returned to this module as `LookupRequestResult` variants.
|
||||
|
||||
use self::parent_chain::{compute_parent_chains, NodeChain};
|
||||
pub use self::single_block_lookup::DownloadResult;
|
||||
use self::single_block_lookup::{LookupRequestError, LookupResult, SingleBlockLookup};
|
||||
@@ -410,21 +432,9 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
/* Error responses */
|
||||
|
||||
pub fn peer_disconnected(&mut self, peer_id: &PeerId) {
|
||||
self.single_block_lookups.retain(|_, lookup| {
|
||||
for (_, lookup) in self.single_block_lookups.iter_mut() {
|
||||
lookup.remove_peer(peer_id);
|
||||
|
||||
// Note: this condition should be removed in the future. It's not strictly necessary to drop a
|
||||
// lookup if there are no peers left. Lookup should only be dropped if it can not make progress
|
||||
if lookup.has_no_peers() {
|
||||
debug!(self.log,
|
||||
"Dropping single lookup after peer disconnection";
|
||||
"block_root" => ?lookup.block_root()
|
||||
);
|
||||
false
|
||||
} else {
|
||||
true
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/* Processing responses */
|
||||
@@ -787,12 +797,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
};
|
||||
|
||||
if stuck_lookup.id == ancestor_stuck_lookup.id {
|
||||
warn!(self.log, "Notify the devs, a sync lookup is stuck";
|
||||
warn!(self.log, "Notify the devs a sync lookup is stuck";
|
||||
"block_root" => ?stuck_lookup.block_root(),
|
||||
"lookup" => ?stuck_lookup,
|
||||
);
|
||||
} else {
|
||||
warn!(self.log, "Notify the devs, a sync lookup is stuck";
|
||||
warn!(self.log, "Notify the devs a sync lookup is stuck";
|
||||
"block_root" => ?stuck_lookup.block_root(),
|
||||
"lookup" => ?stuck_lookup,
|
||||
"ancestor_block_root" => ?ancestor_stuck_lookup.block_root(),
|
||||
|
||||
@@ -197,21 +197,36 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
|
||||
}
|
||||
|
||||
let Some(peer_id) = self.use_rand_available_peer() else {
|
||||
// Allow lookup to not have any peers. In that case do nothing. If the lookup does
|
||||
// not have peers for some time, it will be dropped.
|
||||
// Allow lookup to not have any peers and do nothing. This is an optimization to not
|
||||
// lose progress of lookups created from a block with unknown parent before we receive
|
||||
// attestations for said block.
|
||||
// Lookup sync event safety: If a lookup requires peers to make progress, and does
|
||||
// not receive any new peers for some time it will be dropped. If it receives a new
|
||||
// peer it must attempt to make progress.
|
||||
R::request_state_mut(self)
|
||||
.get_state_mut()
|
||||
.update_awaiting_download_status("no peers");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let request = R::request_state_mut(self);
|
||||
match request.make_request(id, peer_id, downloaded_block_expected_blobs, cx)? {
|
||||
LookupRequestResult::RequestSent(req_id) => {
|
||||
// Lookup sync event safety: If make_request returns `RequestSent`, we are
|
||||
// guaranteed that `BlockLookups::on_download_response` will be called exactly
|
||||
// with this `req_id`.
|
||||
request.get_state_mut().on_download_start(req_id)?
|
||||
}
|
||||
LookupRequestResult::NoRequestNeeded => {
|
||||
// Lookup sync event safety: Advances this request to the terminal `Processed`
|
||||
// state. If all requests reach this state, the request is marked as completed
|
||||
// in `Self::continue_requests`.
|
||||
request.get_state_mut().on_completed_request()?
|
||||
}
|
||||
// Sync will receive a future event to make progress on the request, do nothing now
|
||||
LookupRequestResult::Pending(reason) => {
|
||||
// Lookup sync event safety: Refer to the code paths constructing
|
||||
// `LookupRequestResult::Pending`
|
||||
request
|
||||
.get_state_mut()
|
||||
.update_awaiting_download_status(reason);
|
||||
@@ -222,16 +237,28 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
|
||||
// Otherwise, attempt to progress awaiting processing
|
||||
// If this request is awaiting a parent lookup to be processed, do not send for processing.
|
||||
// The request will be rejected with unknown parent error.
|
||||
//
|
||||
// TODO: The condition `block_is_processed || Block` can be dropped after checking for
|
||||
// unknown parent root when import RPC blobs
|
||||
} else if !awaiting_parent
|
||||
&& (block_is_processed || matches!(R::response_type(), ResponseType::Block))
|
||||
{
|
||||
// maybe_start_processing returns Some if state == AwaitingProcess. This pattern is
|
||||
// useful to conditionally access the result data.
|
||||
if let Some(result) = request.get_state_mut().maybe_start_processing() {
|
||||
// Lookup sync event safety: If `send_for_processing` returns Ok() we are guaranteed
|
||||
// that `BlockLookups::on_processing_result` will be called exactly once with this
|
||||
// lookup_id
|
||||
return R::send_for_processing(id, result, cx);
|
||||
}
|
||||
// Lookup sync event safety: If the request is not in `AwaitingDownload` or
|
||||
// `AwaitingProcessing` state it is guaranteed to receive some event to make progress.
|
||||
}
|
||||
|
||||
// Lookup sync event safety: If a lookup is awaiting a parent we are guaranteed to either:
|
||||
// (1) attempt to make progress with `BlockLookups::continue_child_lookups` if the parent
|
||||
// lookup completes, or (2) get dropped if the parent fails and is dropped.
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -246,10 +273,9 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
|
||||
self.peers.insert(peer_id)
|
||||
}
|
||||
|
||||
/// Remove peer from available peers. Return true if there are no more available peers and all
|
||||
/// requests are not expecting any future event (AwaitingDownload).
|
||||
pub fn remove_peer(&mut self, peer_id: &PeerId) -> bool {
|
||||
self.peers.remove(peer_id)
|
||||
/// Remove peer from available peers.
|
||||
pub fn remove_peer(&mut self, peer_id: &PeerId) {
|
||||
self.peers.remove(peer_id);
|
||||
}
|
||||
|
||||
/// Returns true if this lookup has zero peers
|
||||
|
||||
@@ -290,6 +290,7 @@ impl TestRig {
|
||||
.0
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
fn expect_no_active_single_lookups(&self) {
|
||||
assert!(
|
||||
self.active_single_lookups().is_empty(),
|
||||
@@ -298,6 +299,7 @@ impl TestRig {
|
||||
);
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
fn expect_no_active_lookups(&self) {
|
||||
self.expect_no_active_single_lookups();
|
||||
}
|
||||
@@ -539,10 +541,6 @@ impl TestRig {
|
||||
})
|
||||
}
|
||||
|
||||
fn peer_disconnected(&mut self, disconnected_peer_id: PeerId) {
|
||||
self.send_sync_message(SyncMessage::Disconnect(disconnected_peer_id));
|
||||
}
|
||||
|
||||
/// Return RPCErrors for all active requests of peer
|
||||
fn rpc_error_all_active_requests(&mut self, disconnected_peer_id: PeerId) {
|
||||
self.drain_network_rx();
|
||||
@@ -562,6 +560,10 @@ impl TestRig {
|
||||
}
|
||||
}
|
||||
|
||||
fn peer_disconnected(&mut self, peer_id: PeerId) {
|
||||
self.send_sync_message(SyncMessage::Disconnect(peer_id));
|
||||
}
|
||||
|
||||
fn drain_network_rx(&mut self) {
|
||||
while let Ok(event) = self.network_rx.try_recv() {
|
||||
self.network_rx_queue.push(event);
|
||||
@@ -1026,6 +1028,28 @@ fn test_single_block_lookup_failure() {
|
||||
rig.expect_empty_network();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_block_lookup_peer_disconnected_then_rpc_error() {
|
||||
let mut rig = TestRig::test_setup();
|
||||
|
||||
let block_hash = Hash256::random();
|
||||
let peer_id = rig.new_connected_peer();
|
||||
|
||||
// Trigger the request.
|
||||
rig.trigger_unknown_block_from_attestation(block_hash, peer_id);
|
||||
let id = rig.expect_block_lookup_request(block_hash);
|
||||
|
||||
// The peer disconnect event reaches sync before the rpc error.
|
||||
rig.peer_disconnected(peer_id);
|
||||
// The lookup is not removed as it can still potentially make progress.
|
||||
rig.assert_single_lookups_count(1);
|
||||
// The request fails.
|
||||
rig.single_lookup_failed(id, peer_id, RPCError::Disconnected);
|
||||
rig.expect_block_lookup_request(block_hash);
|
||||
// The request should be removed from the network context on disconnection.
|
||||
rig.expect_empty_network();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_block_lookup_becomes_parent_request() {
|
||||
let mut rig = TestRig::test_setup();
|
||||
@@ -1289,19 +1313,9 @@ fn test_lookup_peer_disconnected_no_peers_left_while_request() {
|
||||
rig.trigger_unknown_parent_block(peer_id, trigger_block.into());
|
||||
rig.peer_disconnected(peer_id);
|
||||
rig.rpc_error_all_active_requests(peer_id);
|
||||
rig.expect_no_active_lookups();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lookup_peer_disconnected_no_peers_left_not_while_request() {
|
||||
let mut rig = TestRig::test_setup();
|
||||
let peer_id = rig.new_connected_peer();
|
||||
let trigger_block = rig.rand_block();
|
||||
rig.trigger_unknown_parent_block(peer_id, trigger_block.into());
|
||||
rig.peer_disconnected(peer_id);
|
||||
// Note: this test case may be removed in the future. It's not strictly necessary to drop a
|
||||
// lookup if there are no peers left. Lookup should only be dropped if it can not make progress
|
||||
rig.expect_no_active_lookups();
|
||||
// Erroring all rpc requests and disconnecting the peer shouldn't remove the requests
|
||||
// from the lookups map as they can still progress.
|
||||
rig.assert_single_lookups_count(2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user