Implement checkpoint sync (#2244)

## Issue Addressed Closes #1891 Closes #1784 ## Proposed Changes Implement checkpoint sync for Lighthouse, enabling it to start from a weak subjectivity checkpoint. ## Additional Info - [x] Return unavailable status for out-of-range blocks requested by peers (#2561) - [x] Implement sync daemon for fetching historical blocks (#2561) - [x] Verify chain hashes (either in `historical_blocks.rs` or the calling module) - [x] Consistency check for initial block + state - [x] Fetch the initial state and block from a beacon node HTTP endpoint - [x] Don't crash fetching beacon states by slot from the API - [x] Background service for state reconstruction, triggered by CLI flag or API call. Considered out of scope for this PR: - Drop the requirement to provide the `--checkpoint-block` (this would require some pretty heavy refactoring of block verification) Co-authored-by: Diva M <divma@protonmail.com>
2026-03-15 10:52:43 +00:00 · 2021-09-22 00:37:28 +00:00
parent 280e4fe23d
commit 9667dc2f03
71 changed files with 4012 additions and 459 deletions
--- a/beacon_node/network/src/sync/range_sync/batch.rs
+++ b/beacon_node/network/src/sync/range_sync/batch.rs
@@ -14,15 +14,34 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5;
 /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty.
 const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3;

+/// Allows customisation of the above constants used in other sync methods such as BackFillSync.
+pub trait BatchConfig {
+    /// The number of failed download attempts at which a batch is considered failed.
+    fn max_batch_download_attempts() -> u8;
+    /// The number of failed processing attempts at which a batch is considered failed.
+    fn max_batch_processing_attempts() -> u8;
+}
+
+pub struct RangeSyncBatchConfig {}
+
+impl BatchConfig for RangeSyncBatchConfig {
+    fn max_batch_download_attempts() -> u8 {
+        MAX_BATCH_DOWNLOAD_ATTEMPTS
+    }
+    fn max_batch_processing_attempts() -> u8 {
+        MAX_BATCH_PROCESSING_ATTEMPTS
+    }
+}
+
 /// Error type of a batch in a wrong state.
 // Such errors should never be encountered.
-pub struct WrongState(pub(super) String);
+pub struct WrongState(pub(crate) String);

 /// Auxiliary type alias for readability.
 type IsFailed = bool;

 /// A segment of a chain.
-pub struct BatchInfo<T: EthSpec> {
+pub struct BatchInfo<T: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
    /// Start slot of the batch.
    start_slot: Slot,
    /// End slot of the batch.
@@ -33,6 +52,8 @@ pub struct BatchInfo<T: EthSpec> {
    failed_download_attempts: Vec<PeerId>,
    /// State of the batch.
    state: BatchState<T>,
+    /// Zero-sized marker that ties the `B: BatchConfig` type parameter to this struct.
+    marker: std::marker::PhantomData<B>,
 }

 /// Current state of a batch
@@ -73,7 +94,7 @@ impl<T: EthSpec> BatchState<T> {
    }
 }

-impl<T: EthSpec> BatchInfo<T> {
+impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
    /// Batches are downloaded excluding the first block of the epoch assuming it has already been
    /// downloaded.
    ///
@@ -91,6 +112,7 @@ impl<T: EthSpec> BatchInfo<T> {
            failed_processing_attempts: Vec::new(),
            failed_download_attempts: Vec::new(),
            state: BatchState::AwaitingDownload,
+            marker: std::marker::PhantomData,
        }
    }

@@ -120,6 +142,7 @@ impl<T: EthSpec> BatchInfo<T> {
        false
    }

+    /// Returns the peer that is currently responsible for progressing the state of the batch.
    pub fn current_peer(&self) -> Option<&PeerId> {
        match &self.state {
            BatchState::AwaitingDownload | BatchState::Failed => None,
@@ -131,6 +154,7 @@ impl<T: EthSpec> BatchInfo<T> {
        }
    }

+    /// Returns a BlocksByRange request associated with the batch.
    pub fn to_blocks_by_range_request(&self) -> BlocksByRangeRequest {
        BlocksByRangeRequest {
            start_slot: self.start_slot.into(),
@@ -192,7 +216,7 @@ impl<T: EthSpec> BatchInfo<T> {
                        // can be tried again
                        self.failed_download_attempts.push(peer);
                        self.state = if self.failed_download_attempts.len()
-                            >= MAX_BATCH_DOWNLOAD_ATTEMPTS as usize
+                            >= B::max_batch_download_attempts() as usize
                        {
                            BatchState::Failed
                        } else {
@@ -219,14 +243,21 @@ impl<T: EthSpec> BatchInfo<T> {
        }
    }

+    /// Mark the batch as failed and return whether we can attempt a re-download.
+    ///
+    /// This can happen if a peer disconnects or some error occurred that was not the peer's fault.
+    /// The `mark_failed` parameter, when set to false, does not increment the failed attempts of
+    /// this batch or register the peer; it only attempts a re-download.
    #[must_use = "Batch may have failed"]
-    pub fn download_failed(&mut self) -> Result<IsFailed, WrongState> {
+    pub fn download_failed(&mut self, mark_failed: bool) -> Result<IsFailed, WrongState> {
        match self.state.poison() {
            BatchState::Downloading(peer, _, _request_id) => {
                // register the attempt and check if the batch can be tried again
-                self.failed_download_attempts.push(peer);
+                if mark_failed {
+                    self.failed_download_attempts.push(peer);
+                }
                self.state = if self.failed_download_attempts.len()
-                    >= MAX_BATCH_DOWNLOAD_ATTEMPTS as usize
+                    >= B::max_batch_download_attempts() as usize
                {
                    BatchState::Failed
                } else {
@@ -294,7 +325,7 @@ impl<T: EthSpec> BatchInfo<T> {

                    // check if the batch can be downloaded again
                    if self.failed_processing_attempts.len()
-                        >= MAX_BATCH_PROCESSING_ATTEMPTS as usize
+                        >= B::max_batch_processing_attempts() as usize
                    {
                        BatchState::Failed
                    } else {
@@ -324,7 +355,7 @@ impl<T: EthSpec> BatchInfo<T> {

                // check if the batch can be downloaded again
                self.state = if self.failed_processing_attempts.len()
-                    >= MAX_BATCH_PROCESSING_ATTEMPTS as usize
+                    >= B::max_batch_processing_attempts() as usize
                {
                    BatchState::Failed
                } else {
@@ -365,7 +396,7 @@ impl Attempt {
    }
 }

-impl<T: EthSpec> slog::KV for &mut BatchInfo<T> {
+impl<T: EthSpec, B: BatchConfig> slog::KV for &mut BatchInfo<T, B> {
    fn serialize(
        &self,
        record: &slog::Record,
@@ -375,7 +406,7 @@ impl<T: EthSpec> slog::KV for &mut BatchInfo<T> {
    }
 }

-impl<T: EthSpec> slog::KV for BatchInfo<T> {
+impl<T: EthSpec, B: BatchConfig> slog::KV for BatchInfo<T, B> {
    fn serialize(
        &self,
        record: &slog::Record,
--- a/beacon_node/network/src/sync/range_sync/chain.rs
+++ b/beacon_node/network/src/sync/range_sync/chain.rs
@@ -181,7 +181,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
            // fail the batches
            for id in batch_ids {
                if let Some(batch) = self.batches.get_mut(&id) {
-                    if batch.download_failed()? {
+                    if batch.download_failed(true)? {
                        return Err(RemoveChain::ChainFailed(id));
                    }
                    self.retry_batch_download(network, id)?;
@@ -273,7 +273,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        }
    }

-    /// Sends to process the batch with the given id.
+    /// Processes the batch with the given id.
    /// The batch must exist and be ready for processing
    fn process_batch(
        &mut self,
@@ -794,7 +794,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
            if let Some(active_requests) = self.peers.get_mut(peer_id) {
                active_requests.remove(&batch_id);
            }
-            if batch.download_failed()? {
+            if batch.download_failed(true)? {
                return Err(RemoveChain::ChainFailed(batch_id));
            }
            self.retry_batch_download(network, batch_id)
@@ -837,7 +837,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        }
    }

-    /// Requests the batch asigned to the given id from a given peer.
+    /// Requests the batch assigned to the given id from a given peer.
    pub fn send_batch(
        &mut self,
        network: &mut SyncNetworkContext<T::EthSpec>,
@@ -883,7 +883,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                    self.peers
                        .get_mut(&peer)
                        .map(|request| request.remove(&batch_id));
-                    if batch.download_failed()? {
+                    if batch.download_failed(true)? {
                        return Err(RemoveChain::ChainFailed(batch_id));
                    } else {
                        return self.retry_batch_download(network, batch_id);
@@ -990,7 +990,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        // this batch could have been included already being an optimistic batch
        match self.batches.entry(batch_id) {
            Entry::Occupied(_) => {
-                // this batch doesn't need downlading, let this same function decide the next batch
+                // this batch doesn't need downloading, let this same function decide the next batch
                self.to_be_downloaded += EPOCHS_PER_BATCH;
                self.include_next_batch()
            }
--- a/beacon_node/network/src/sync/range_sync/mod.rs
+++ b/beacon_node/network/src/sync/range_sync/mod.rs
@@ -7,7 +7,7 @@ mod chain_collection;
 mod range;
 mod sync_type;

-pub use batch::BatchInfo;
+pub use batch::{BatchConfig, BatchInfo, BatchState};
 pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH};
 pub use range::RangeSync;
 pub use sync_type::RangeSyncType;
--- a/beacon_node/network/src/sync/range_sync/range.rs
+++ b/beacon_node/network/src/sync/range_sync/range.rs
@@ -39,7 +39,7 @@
 //!  Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially
 //!  and further batches are requested as current blocks are being processed.

-use super::chain::{ChainId, RemoveChain, SyncingChain};
+use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain};
 use super::chain_collection::ChainCollection;
 use super::sync_type::RangeSyncType;
 use crate::beacon_processor::WorkEvent as BeaconWorkEvent;
@@ -194,34 +194,29 @@ impl<T: BeaconChainTypes> RangeSync<T> {
        &mut self,
        network: &mut SyncNetworkContext<T::EthSpec>,
        peer_id: PeerId,
+        chain_id: ChainId,
+        batch_id: BatchId,
        request_id: RequestId,
        beacon_block: Option<SignedBeaconBlock<T::EthSpec>>,
    ) {
-        // get the chain and batch for which this response belongs
-        if let Some((chain_id, batch_id)) =
-            network.blocks_by_range_response(request_id, beacon_block.is_none())
-        {
-            // check if this chunk removes the chain
-            match self.chains.call_by_id(chain_id, |chain| {
-                chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block)
-            }) {
-                Ok((removed_chain, sync_type)) => {
-                    if let Some((removed_chain, remove_reason)) = removed_chain {
-                        self.on_chain_removed(
-                            removed_chain,
-                            sync_type,
-                            remove_reason,
-                            network,
-                            "block response",
-                        );
-                    }
-                }
-                Err(_) => {
-                    trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
+        // check if this chunk removes the chain
+        match self.chains.call_by_id(chain_id, |chain| {
+            chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block)
+        }) {
+            Ok((removed_chain, sync_type)) => {
+                if let Some((removed_chain, remove_reason)) = removed_chain {
+                    self.on_chain_removed(
+                        removed_chain,
+                        sync_type,
+                        remove_reason,
+                        network,
+                        "block response",
+                    );
                }
            }
-        } else {
-            trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
+            Err(_) => {
+                trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
+            }
        }
    }

@@ -298,31 +293,28 @@ impl<T: BeaconChainTypes> RangeSync<T> {
        &mut self,
        network: &mut SyncNetworkContext<T::EthSpec>,
        peer_id: PeerId,
+        batch_id: BatchId,
+        chain_id: ChainId,
        request_id: RequestId,
    ) {
-        // get the chain and batch for which this response belongs
-        if let Some((chain_id, batch_id)) = network.blocks_by_range_response(request_id, true) {
-            // check that this request is pending
-            match self.chains.call_by_id(chain_id, |chain| {
-                chain.inject_error(network, batch_id, &peer_id, request_id)
-            }) {
-                Ok((removed_chain, sync_type)) => {
-                    if let Some((removed_chain, remove_reason)) = removed_chain {
-                        self.on_chain_removed(
-                            removed_chain,
-                            sync_type,
-                            remove_reason,
-                            network,
-                            "RPC error",
-                        );
-                    }
-                }
-                Err(_) => {
-                    trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
+        // check that this request is pending
+        match self.chains.call_by_id(chain_id, |chain| {
+            chain.inject_error(network, batch_id, &peer_id, request_id)
+        }) {
+            Ok((removed_chain, sync_type)) => {
+                if let Some((removed_chain, remove_reason)) = removed_chain {
+                    self.on_chain_removed(
+                        removed_chain,
+                        sync_type,
+                        remove_reason,
+                        network,
+                        "RPC error",
+                    );
                }
            }
-        } else {
-            trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
+            Err(_) => {
+                trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
+            }
        }
    }