Sync fixes (#801)

* Randomize peer selection for batch errors * Downgrade attestation logging * Handle range sync errors * Update lock file * Downgrade logs * Decrease batch size for better thread handling * Optimise peer selection in range sync
2026-06-17 10:48:28 +00:00 · 2020-01-15 14:48:09 +11:00
parent 6285dc3aa7
commit c184a98170
10 changed files with 285 additions and 185 deletions
--- a/beacon_node/network/src/sync/network_context.rs
+++ b/beacon_node/network/src/sync/network_context.rs
@@ -36,14 +36,19 @@ impl SyncNetworkContext {
        chain: Weak<BeaconChain<T>>,
        peer_id: PeerId,
    ) {
-        trace!(
-            self.log,
-            "Sending Status Request";
-            "method" => "STATUS",
-            "peer" => format!("{:?}", peer_id)
-        );
        if let Some(chain) = chain.upgrade() {
            if let Some(status_message) = status_message(&chain) {
+                debug!(
+                    self.log,
+                    "Sending Status Request";
+                    "peer" => format!("{:?}", peer_id),
+                    "fork_version" => format!("{:?}", status_message.fork_version),
+                    "finalized_root" => format!("{:?}", status_message.finalized_root),
+                    "finalized_epoch" => format!("{:?}", status_message.finalized_epoch),
+                    "head_root" => format!("{}", status_message.head_root),
+                    "head_slot" => format!("{}", status_message.head_slot),
+                );
+
                let _ = self.send_rpc_request(peer_id, RPCRequest::Status(status_message));
            }
        }
@@ -124,8 +129,7 @@ impl SyncNetworkContext {
        self.network_send
            .try_send(NetworkMessage::RPC(peer_id, rpc_event))
            .map_err(|_| {
-                // This is likely to happen when shutting down. Suppress this warning to trace for now
-                trace!(
+                debug!(
                    self.log,
                    "Could not send RPC message to the network service"
                );
--- a/beacon_node/network/src/sync/range_sync/batch.rs
+++ b/beacon_node/network/src/sync/range_sync/batch.rs
@@ -0,0 +1,107 @@
+use eth2_libp2p::rpc::RequestId;
+use eth2_libp2p::PeerId;
+use fnv::FnvHashMap;
+use std::cmp::Ordering;
+use std::collections::hash_map::Entry;
+use std::collections::{HashMap, HashSet};
+use types::{BeaconBlock, EthSpec, Hash256, Slot};
+
+/// A collection of sequential blocks that are requested from peers in a single RPC request.
+#[derive(PartialEq)]
+pub struct Batch<T: EthSpec> {
+    /// The ID of the batch, these are sequential.
+    pub id: u64,
+    /// The requested start slot of the batch, inclusive.
+    pub start_slot: Slot,
+    /// The requested end slot of batch, exclusive.
+    pub end_slot: Slot,
+    /// The hash of the chain root to requested from the peer.
+    pub head_root: Hash256,
+    /// The peer that was originally assigned to the batch.
+    pub _original_peer: PeerId,
+    /// The peer that is currently assigned to the batch.
+    pub current_peer: PeerId,
+    /// The number of retries this batch has undergone.
+    pub retries: u8,
+    /// The blocks that have been downloaded.
+    pub downloaded_blocks: Vec<BeaconBlock<T>>,
+}
+
+impl<T: EthSpec> Ord for Batch<T> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.id.cmp(&other.id)
+    }
+}
+
+impl<T: EthSpec> PartialOrd for Batch<T> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// A structure that contains a mapping of pending batch requests, that also keeps track of which
+/// peers are currently making batch requests.
+///
+/// This is used to optimise searches for idle peers (peers that have no outbound batch requests).
+pub struct PendingBatches<T: EthSpec> {
+    /// The current pending batches.
+    batches: FnvHashMap<RequestId, Batch<T>>,
+    /// A mapping of peers to the number of pending requests.
+    peer_requests: HashMap<PeerId, HashSet<RequestId>>,
+}
+
+impl<T: EthSpec> PendingBatches<T> {
+    pub fn new() -> Self {
+        PendingBatches {
+            batches: FnvHashMap::default(),
+            peer_requests: HashMap::new(),
+        }
+    }
+
+    pub fn insert(&mut self, request_id: RequestId, batch: Batch<T>) -> Option<Batch<T>> {
+        let peer_request = batch.current_peer.clone();
+        self.peer_requests
+            .entry(peer_request)
+            .or_insert_with(|| HashSet::new())
+            .insert(request_id);
+        self.batches.insert(request_id, batch)
+    }
+
+    pub fn remove(&mut self, request_id: &RequestId) -> Option<Batch<T>> {
+        if let Some(batch) = self.batches.remove(request_id) {
+            if let Entry::Occupied(mut entry) = self.peer_requests.entry(batch.current_peer.clone())
+            {
+                entry.get_mut().remove(request_id);
+
+                if entry.get().is_empty() {
+                    entry.remove();
+                }
+            }
+            Some(batch)
+        } else {
+            None
+        }
+    }
+
+    /// Adds a block to the batches if the request id exists. Returns None if there is no batch
+    /// matching the request id.
+    pub fn add_block(&mut self, request_id: &RequestId, block: BeaconBlock<T>) -> Option<()> {
+        let batch = self.batches.get_mut(request_id)?;
+        batch.downloaded_blocks.push(block);
+        Some(())
+    }
+
+    /// Returns true if there the peer does not exist in the peer_requests mapping. Indicating it
+    /// has no pending outgoing requests.
+    pub fn peer_is_idle(&self, peer_id: &PeerId) -> bool {
+        self.peer_requests.get(peer_id).is_none()
+    }
+
+    /// Removes a batch for a given peer.
+    pub fn remove_batch_by_peer(&mut self, peer_id: &PeerId) -> Option<Batch<T>> {
+        let request_ids = self.peer_requests.get(peer_id)?;
+
+        let request_id = request_ids.iter().next()?.clone();
+        self.remove(&request_id)
+    }
+}
--- a/beacon_node/network/src/sync/range_sync/chain.rs
+++ b/beacon_node/network/src/sync/range_sync/chain.rs
@@ -1,12 +1,11 @@
 use crate::message_processor::FUTURE_SLOT_TOLERANCE;
 use crate::sync::network_context::SyncNetworkContext;
+use crate::sync::range_sync::batch::{Batch, PendingBatches};
 use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessingOutcome};
 use eth2_libp2p::rpc::methods::*;
 use eth2_libp2p::rpc::RequestId;
 use eth2_libp2p::PeerId;
-use fnv::FnvHashMap;
 use slog::{crit, debug, error, trace, warn, Logger};
-use std::cmp::Ordering;
 use std::collections::HashSet;
 use std::ops::Sub;
 use std::sync::Weak;
@@ -18,44 +17,12 @@ use types::{BeaconBlock, EthSpec, Hash256, Slot};
 /// responder will fill the response up to the max request size, assuming they have the bandwidth
 /// to do so.
 //TODO: Make this dynamic based on peer's bandwidth
-const BLOCKS_PER_BATCH: u64 = 50;
+//TODO: This is lower due to current thread design. Modify once rebuilt.
+const BLOCKS_PER_BATCH: u64 = 25;

 /// The number of times to retry a batch before the chain is considered failed and removed.
 const MAX_BATCH_RETRIES: u8 = 5;

-/// A collection of sequential blocks that are requested from peers in a single RPC request.
-#[derive(PartialEq)]
-pub struct Batch<T: EthSpec> {
-    /// The ID of the batch, batches are ID's sequentially.
-    id: u64,
-    /// The requested start slot of the batch, inclusive.
-    start_slot: Slot,
-    /// The requested end slot of batch, exclusive.
-    end_slot: Slot,
-    /// The hash of the chain root to requested from the peer.
-    head_root: Hash256,
-    /// The peer that was originally assigned to the batch.
-    _original_peer: PeerId,
-    /// The peer that is currently assigned to the batch.
-    pub current_peer: PeerId,
-    /// The number of retries this batch has undergone.
-    retries: u8,
-    /// The blocks that have been downloaded.
-    downloaded_blocks: Vec<BeaconBlock<T>>,
-}
-
-impl<T: EthSpec> Ord for Batch<T> {
-    fn cmp(&self, other: &Self) -> Ordering {
-        self.id.cmp(&other.id)
-    }
-}
-
-impl<T: EthSpec> PartialOrd for Batch<T> {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
 /// A return type for functions that act on a `Chain` which informs the caller whether the chain
 /// has been completed and should be removed or to be kept if further processing is
 /// required.
@@ -105,7 +72,7 @@ pub struct SyncingChain<T: BeaconChainTypes> {

    /// The batches that are currently awaiting a response from a peer. An RPC request for these
    /// have been sent.
-    pub pending_batches: FnvHashMap<RequestId, Batch<T::EthSpec>>,
+    pub pending_batches: PendingBatches<T::EthSpec>,

    /// The batches that have been downloaded and are awaiting processing and/or validation.
    completed_batches: Vec<Batch<T::EthSpec>>,
@@ -151,7 +118,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
            start_slot,
            target_head_slot,
            target_head_root,
-            pending_batches: FnvHashMap::default(),
+            pending_batches: PendingBatches::new(),
            completed_batches: Vec::new(),
            peer_pool,
            to_be_downloaded_id: 1,
@@ -176,9 +143,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        log: &slog::Logger,
    ) -> Option<ProcessingResult> {
        if let Some(block) = beacon_block {
-            let batch = self.pending_batches.get_mut(&request_id)?;
            // This is not a stream termination, simply add the block to the request
-            batch.downloaded_blocks.push(block.clone());
+            self.pending_batches.add_block(&request_id, block.clone())?;
            return Some(ProcessingResult::KeepChain);
        } else {
            // A stream termination has been sent. This batch has ended. Process a completed batch.
@@ -202,10 +168,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        // blocks for the peer.
        debug!(log, "Completed batch received"; "id"=>batch.id, "blocks"=>batch.downloaded_blocks.len(), "awaiting_batches" => self.completed_batches.len());

-        // The peer that completed this batch, may be re-requested if this batch doesn't complete
-        // the chain and there is no error in processing
-        let current_peer = batch.current_peer.clone();
-
        // verify the range of received blocks
        // Note that the order of blocks is verified in block processing
        if let Some(last_slot) = batch.downloaded_blocks.last().map(|b| b.slot) {
@@ -244,7 +206,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {

        if self.state == ChainSyncingState::Syncing {
            // pre-emptively request more blocks from peers whilst we process current blocks,
-            self.send_range_request(network, current_peer, log);
+            if !self.send_range_request(network, log) {
+                debug!(log, "No peer available for next batch.")
+            }
        }

        // Try and process batches sequentially in the ordered list.
@@ -424,38 +388,22 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                .retain(|batch| batch.id >= last_processed_id.saturating_sub(1));
        }

-        // Now begin requesting blocks from the peer pool. Ignore any peers with currently
-        // pending requests
-        let pending_peers = self
-            .pending_batches
-            .values()
-            .map(|batch| batch.current_peer.clone())
-            .collect::<Vec<_>>();
-
-        let peers = self
-            .peer_pool
-            .iter()
-            .filter(|peer| !pending_peers.contains(peer))
-            .cloned()
-            .collect::<Vec<_>>();
-
-        for peer_id in peers {
-            // send a blocks by range request to the peer
-            self.send_range_request(network, peer_id, log);
-        }
+        // Now begin requesting blocks from the peer pool, until all peers are exhausted.
+        while self.send_range_request(network, log) {}

        self.state = ChainSyncingState::Syncing;
    }

-    /// A peer has been added.
+    /// Add a peer to the chain.
    ///
    /// If the chain is active, this starts requesting batches from this peer.
-    pub fn peer_added(
+    pub fn add_peer(
        &mut self,
        network: &mut SyncNetworkContext,
        peer_id: PeerId,
        log: &slog::Logger,
    ) {
+        self.peer_pool.insert(peer_id.clone());
        // do not request blocks if the chain is not syncing
        if let ChainSyncingState::Stopped = self.state {
            debug!(log, "Peer added to a non-syncing chain"; "peer_id" => format!("{:?}", peer_id));
@@ -463,7 +411,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        }

        // find the next batch and request it from the peer
-        self.send_range_request(network, peer_id, log);
+        self.send_range_request(network, log);
    }

    /// Sends a STATUS message to all peers in the peer pool.
@@ -473,19 +421,31 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        }
    }

-    /// Requests the next required batch from the provided peer.
-    fn send_range_request(
-        &mut self,
-        network: &mut SyncNetworkContext,
-        peer_id: PeerId,
-        log: &slog::Logger,
-    ) {
+    /// Requests the next required batch from a peer. Returns true, if there was a peer available
+    /// to send a request and there are batches to request, false otherwise.
+    fn send_range_request(&mut self, network: &mut SyncNetworkContext, log: &slog::Logger) -> bool {
        // find the next pending batch and request it from the peer
-        if let Some(batch) = self.get_next_batch(peer_id) {
-            debug!(log, "Requesting batch"; "start_slot" => batch.start_slot, "end_slot" => batch.end_slot, "id" => batch.id, "peer" => format!("{:?}", batch.current_peer), "head_root"=> format!("{}", batch.head_root));
-            // send the batch
-            self.send_batch(network, batch);
+        if let Some(peer_id) = self.get_next_peer() {
+            if let Some(batch) = self.get_next_batch(peer_id) {
+                debug!(log, "Requesting batch"; "start_slot" => batch.start_slot, "end_slot" => batch.end_slot, "id" => batch.id, "peer" => format!("{:?}", batch.current_peer), "head_root"=> format!("{}", batch.head_root));
+                // send the batch
+                self.send_batch(network, batch);
+                return true;
+            }
        }
+        return false;
+    }
+
+    /// Returns a peer if there exists a peer which does not currently have a pending request.
+    ///
+    /// This is used to create the next request.
+    fn get_next_peer(&self) -> Option<PeerId> {
+        for peer in self.peer_pool.iter() {
+            if self.pending_batches.peer_is_idle(peer) {
+                return Some(peer.clone());
+            }
+        }
+        None
    }

    /// Requests the provided batch from the provided peer.
@@ -546,7 +506,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        if let Some(batch) = self.pending_batches.remove(&request_id) {
            warn!(log, "Batch failed. RPC Error"; "id" => batch.id, "retries" => batch.retries, "peer" => format!("{:?}", peer_id));

-            Some(self.failed_batch(network, batch))
+            Some(self.failed_batch(network, batch, log))
        } else {
            None
        }
@@ -561,6 +521,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        &mut self,
        network: &mut SyncNetworkContext,
        mut batch: Batch<T::EthSpec>,
+        log: &Logger,
    ) -> ProcessingResult {
        batch.retries += 1;

@@ -580,6 +541,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                .unwrap_or_else(|| current_peer);

            batch.current_peer = new_peer.clone();
+            debug!(log, "Re-Requesting batch"; "start_slot" => batch.start_slot, "end_slot" => batch.end_slot, "id" => batch.id, "peer" => format!("{:?}", batch.current_peer), "head_root"=> format!("{}", batch.head_root));
            self.send_batch(network, batch);
            ProcessingResult::KeepChain
        }
--- a/beacon_node/network/src/sync/range_sync/mod.rs
+++ b/beacon_node/network/src/sync/range_sync/mod.rs
@@ -1,6 +1,7 @@
 //! This provides the logic for syncing a chain when the local node is far behind it's current
 //! peers.

+mod batch;
 mod chain;
 mod chain_collection;
 mod range;
--- a/beacon_node/network/src/sync/range_sync/range.rs
+++ b/beacon_node/network/src/sync/range_sync/range.rs
@@ -154,8 +154,7 @@ impl<T: BeaconChainTypes> RangeSync<T> {
                debug!(self.log, "Finalized chain exists, adding peer"; "peer_id" => format!("{:?}", peer_id), "target_root" => format!("{}", chain.target_head_root), "end_slot" => chain.target_head_slot, "start_slot"=> chain.start_slot);

                // add the peer to the chain's peer pool
-                chain.peer_pool.insert(peer_id.clone());
-                chain.peer_added(network, peer_id, &self.log);
+                chain.add_peer(network, peer_id, &self.log);

                // check if the new peer's addition will favour a new syncing chain.
                self.chains.update_finalized(network, &self.log);
@@ -189,8 +188,7 @@ impl<T: BeaconChainTypes> RangeSync<T> {
                debug!(self.log, "Adding peer to the existing head chain peer pool"; "head_root" => format!("{}",remote.head_root), "head_slot" => remote.head_slot, "peer_id" => format!("{:?}", peer_id));

                // add the peer to the head's pool
-                chain.peer_pool.insert(peer_id.clone());
-                chain.peer_added(network, peer_id.clone(), &self.log);
+                chain.add_peer(network, peer_id.clone(), &self.log);
            } else {
                // There are no other head chains that match this peer's status, create a new one, and
                let start_slot = std::cmp::min(local_info.head_slot, remote_finalized_slot);
@@ -306,22 +304,16 @@ impl<T: BeaconChainTypes> RangeSync<T> {
    /// for this peer. If so we mark the batch as failed. The batch may then hit it's maximum
    /// retries. In this case, we need to remove the chain and re-status all the peers.
    fn remove_peer(&mut self, network: &mut SyncNetworkContext, peer_id: &PeerId) {
+        let log_ref = &self.log;
        match self.chains.head_finalized_request(|chain| {
-            if chain.peer_pool.remove(&peer_id) {
+            if chain.peer_pool.remove(peer_id) {
                // this chain contained the peer
-                let pending_batches_requests = chain
-                    .pending_batches
-                    .iter()
-                    .filter(|(_, batch)| batch.current_peer == *peer_id)
-                    .map(|(id, _)| id)
-                    .cloned()
-                    .collect::<Vec<_>>();
-                for request_id in pending_batches_requests {
-                    if let Some(batch) = chain.pending_batches.remove(&request_id) {
-                        if let ProcessingResult::RemoveChain = chain.failed_batch(network, batch) {
-                            // a single batch failed, remove the chain
-                            return Some(ProcessingResult::RemoveChain);
-                        }
+                while let Some(batch) = chain.pending_batches.remove_batch_by_peer(peer_id) {
+                    if let ProcessingResult::RemoveChain =
+                        chain.failed_batch(network, batch, log_ref)
+                    {
+                        // a single batch failed, remove the chain
+                        return Some(ProcessingResult::RemoveChain);
                    }
                }
                // peer removed from chain, no batch failed