Implement checkpoint sync (#2244)

## Issue Addressed

Closes #1891
Closes #1784

## Proposed Changes

Implement checkpoint sync for Lighthouse, enabling it to start from a weak subjectivity checkpoint.

## Additional Info

- [x] Return unavailable status for out-of-range blocks requested by peers (#2561)
- [x] Implement sync daemon for fetching historical blocks (#2561)
- [x] Verify chain hashes (either in `historical_blocks.rs` or the calling module)
- [x] Consistency check for initial block + state
- [x] Fetch the initial state and block from a beacon node HTTP endpoint
- [x] Don't crash fetching beacon states by slot from the API
- [x] Background service for state reconstruction, triggered by CLI flag or API call.

Considered out of scope for this PR:

- Drop the requirement to provide the `--checkpoint-block` (this would require some pretty heavy refactoring of block verification)


Co-authored-by: Diva M <divma@protonmail.com>
This commit is contained in:
Michael Sproul
2021-09-22 00:37:28 +00:00
parent 280e4fe23d
commit 9667dc2f03
71 changed files with 4012 additions and 459 deletions

View File

@@ -14,15 +14,34 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5;
/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty.
const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3;
/// Tunable retry limits for a batch, so that sync methods other than range sync
/// (such as BackFillSync) can choose their own values instead of the module constants.
pub trait BatchConfig {
    /// Number of failed download attempts at which a batch is considered failed.
    fn max_batch_download_attempts() -> u8;

    /// Number of failed processing attempts at which a batch is considered failed.
    fn max_batch_processing_attempts() -> u8;
}
/// Batch configuration for range sync: it exposes this module's default retry limits
/// unchanged.
pub struct RangeSyncBatchConfig {}

impl BatchConfig for RangeSyncBatchConfig {
    /// Returns the module-level `MAX_BATCH_PROCESSING_ATTEMPTS`.
    fn max_batch_processing_attempts() -> u8 {
        MAX_BATCH_PROCESSING_ATTEMPTS
    }

    /// Returns the module-level `MAX_BATCH_DOWNLOAD_ATTEMPTS`.
    fn max_batch_download_attempts() -> u8 {
        MAX_BATCH_DOWNLOAD_ATTEMPTS
    }
}
/// Error type of a batch in a wrong state.
// Such errors should never be encountered.
pub struct WrongState(pub(super) String);
pub struct WrongState(pub(crate) String);
/// Auxiliary type alias for readability.
type IsFailed = bool;
/// A segment of a chain.
pub struct BatchInfo<T: EthSpec> {
pub struct BatchInfo<T: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
/// Start slot of the batch.
start_slot: Slot,
/// End slot of the batch.
@@ -33,6 +52,8 @@ pub struct BatchInfo<T: EthSpec> {
failed_download_attempts: Vec<PeerId>,
/// State of the batch.
state: BatchState<T>,
/// Zero-sized marker that pins the `BatchConfig` type parameter `B` to this struct.
marker: std::marker::PhantomData<B>,
}
/// Current state of a batch
@@ -73,7 +94,7 @@ impl<T: EthSpec> BatchState<T> {
}
}
impl<T: EthSpec> BatchInfo<T> {
impl<T: EthSpec, B: BatchConfig> BatchInfo<T, B> {
/// Batches are downloaded excluding the first block of the epoch assuming it has already been
/// downloaded.
///
@@ -91,6 +112,7 @@ impl<T: EthSpec> BatchInfo<T> {
failed_processing_attempts: Vec::new(),
failed_download_attempts: Vec::new(),
state: BatchState::AwaitingDownload,
marker: std::marker::PhantomData,
}
}
@@ -120,6 +142,7 @@ impl<T: EthSpec> BatchInfo<T> {
false
}
/// Returns the peer that is currently responsible for progressing the state of the batch.
pub fn current_peer(&self) -> Option<&PeerId> {
match &self.state {
BatchState::AwaitingDownload | BatchState::Failed => None,
@@ -131,6 +154,7 @@ impl<T: EthSpec> BatchInfo<T> {
}
}
/// Returns a BlocksByRange request associated with the batch.
pub fn to_blocks_by_range_request(&self) -> BlocksByRangeRequest {
BlocksByRangeRequest {
start_slot: self.start_slot.into(),
@@ -192,7 +216,7 @@ impl<T: EthSpec> BatchInfo<T> {
// can be tried again
self.failed_download_attempts.push(peer);
self.state = if self.failed_download_attempts.len()
>= MAX_BATCH_DOWNLOAD_ATTEMPTS as usize
>= B::max_batch_download_attempts() as usize
{
BatchState::Failed
} else {
@@ -219,14 +243,21 @@ impl<T: EthSpec> BatchInfo<T> {
}
}
/// Mark the batch as failed and return whether we can attempt a re-download.
///
/// This can happen if a peer disconnects or some error occurred that was not the peer's fault.
/// The `mark_failed` parameter, when set to false, does not increment the failed attempts of
/// this batch or register the peer; it only attempts a re-download.
#[must_use = "Batch may have failed"]
pub fn download_failed(&mut self) -> Result<IsFailed, WrongState> {
pub fn download_failed(&mut self, mark_failed: bool) -> Result<IsFailed, WrongState> {
match self.state.poison() {
BatchState::Downloading(peer, _, _request_id) => {
// register the attempt and check if the batch can be tried again
self.failed_download_attempts.push(peer);
if mark_failed {
self.failed_download_attempts.push(peer);
}
self.state = if self.failed_download_attempts.len()
>= MAX_BATCH_DOWNLOAD_ATTEMPTS as usize
>= B::max_batch_download_attempts() as usize
{
BatchState::Failed
} else {
@@ -294,7 +325,7 @@ impl<T: EthSpec> BatchInfo<T> {
// check if the batch can be downloaded again
if self.failed_processing_attempts.len()
>= MAX_BATCH_PROCESSING_ATTEMPTS as usize
>= B::max_batch_processing_attempts() as usize
{
BatchState::Failed
} else {
@@ -324,7 +355,7 @@ impl<T: EthSpec> BatchInfo<T> {
// check if the batch can be downloaded again
self.state = if self.failed_processing_attempts.len()
>= MAX_BATCH_PROCESSING_ATTEMPTS as usize
>= B::max_batch_processing_attempts() as usize
{
BatchState::Failed
} else {
@@ -365,7 +396,7 @@ impl Attempt {
}
}
impl<T: EthSpec> slog::KV for &mut BatchInfo<T> {
impl<T: EthSpec, B: BatchConfig> slog::KV for &mut BatchInfo<T, B> {
fn serialize(
&self,
record: &slog::Record,
@@ -375,7 +406,7 @@ impl<T: EthSpec> slog::KV for &mut BatchInfo<T> {
}
}
impl<T: EthSpec> slog::KV for BatchInfo<T> {
impl<T: EthSpec, B: BatchConfig> slog::KV for BatchInfo<T, B> {
fn serialize(
&self,
record: &slog::Record,

View File

@@ -181,7 +181,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// fail the batches
for id in batch_ids {
if let Some(batch) = self.batches.get_mut(&id) {
if batch.download_failed()? {
if batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(id));
}
self.retry_batch_download(network, id)?;
@@ -273,7 +273,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
}
/// Sends to process the batch with the given id.
/// Processes the batch with the given id.
/// The batch must exist and be ready for processing
fn process_batch(
&mut self,
@@ -794,7 +794,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
if let Some(active_requests) = self.peers.get_mut(peer_id) {
active_requests.remove(&batch_id);
}
if batch.download_failed()? {
if batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(batch_id));
}
self.retry_batch_download(network, batch_id)
@@ -837,7 +837,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
}
/// Requests the batch asigned to the given id from a given peer.
/// Requests the batch assigned to the given id from a given peer.
pub fn send_batch(
&mut self,
network: &mut SyncNetworkContext<T::EthSpec>,
@@ -883,7 +883,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.peers
.get_mut(&peer)
.map(|request| request.remove(&batch_id));
if batch.download_failed()? {
if batch.download_failed(true)? {
return Err(RemoveChain::ChainFailed(batch_id));
} else {
return self.retry_batch_download(network, batch_id);
@@ -990,7 +990,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// this batch could have been included already being an optimistic batch
match self.batches.entry(batch_id) {
Entry::Occupied(_) => {
// this batch doesn't need downlading, let this same function decide the next batch
// this batch doesn't need downloading, let this same function decide the next batch
self.to_be_downloaded += EPOCHS_PER_BATCH;
self.include_next_batch()
}

View File

@@ -7,7 +7,7 @@ mod chain_collection;
mod range;
mod sync_type;
pub use batch::BatchInfo;
pub use batch::{BatchConfig, BatchInfo, BatchState};
pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH};
pub use range::RangeSync;
pub use sync_type::RangeSyncType;

View File

@@ -39,7 +39,7 @@
//! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially
//! and further batches are requested as current blocks are being processed.
use super::chain::{ChainId, RemoveChain, SyncingChain};
use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain};
use super::chain_collection::ChainCollection;
use super::sync_type::RangeSyncType;
use crate::beacon_processor::WorkEvent as BeaconWorkEvent;
@@ -194,34 +194,29 @@ impl<T: BeaconChainTypes> RangeSync<T> {
&mut self,
network: &mut SyncNetworkContext<T::EthSpec>,
peer_id: PeerId,
chain_id: ChainId,
batch_id: BatchId,
request_id: RequestId,
beacon_block: Option<SignedBeaconBlock<T::EthSpec>>,
) {
// get the chain and batch for which this response belongs
if let Some((chain_id, batch_id)) =
network.blocks_by_range_response(request_id, beacon_block.is_none())
{
// check if this chunk removes the chain
match self.chains.call_by_id(chain_id, |chain| {
chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block)
}) {
Ok((removed_chain, sync_type)) => {
if let Some((removed_chain, remove_reason)) = removed_chain {
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
"block response",
);
}
}
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
// check if this chunk removes the chain
match self.chains.call_by_id(chain_id, |chain| {
chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block)
}) {
Ok((removed_chain, sync_type)) => {
if let Some((removed_chain, remove_reason)) = removed_chain {
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
"block response",
);
}
}
} else {
trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
}
}
}
@@ -298,31 +293,28 @@ impl<T: BeaconChainTypes> RangeSync<T> {
&mut self,
network: &mut SyncNetworkContext<T::EthSpec>,
peer_id: PeerId,
batch_id: BatchId,
chain_id: ChainId,
request_id: RequestId,
) {
// get the chain and batch for which this response belongs
if let Some((chain_id, batch_id)) = network.blocks_by_range_response(request_id, true) {
// check that this request is pending
match self.chains.call_by_id(chain_id, |chain| {
chain.inject_error(network, batch_id, &peer_id, request_id)
}) {
Ok((removed_chain, sync_type)) => {
if let Some((removed_chain, remove_reason)) = removed_chain {
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
"RPC error",
);
}
}
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
// check that this request is pending
match self.chains.call_by_id(chain_id, |chain| {
chain.inject_error(network, batch_id, &peer_id, request_id)
}) {
Ok((removed_chain, sync_type)) => {
if let Some((removed_chain, remove_reason)) = removed_chain {
self.on_chain_removed(
removed_chain,
sync_type,
remove_reason,
network,
"RPC error",
);
}
}
} else {
trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
}
}
}