Implement checkpoint sync (#2244)

## Issue Addressed

Closes #1891
Closes #1784

## Proposed Changes

Implement checkpoint sync for Lighthouse, enabling it to start from a weak subjectivity checkpoint.

## Additional Info

- [x] Return unavailable status for out-of-range blocks requested by peers (#2561)
- [x] Implement sync daemon for fetching historical blocks (#2561)
- [x] Verify chain hashes (either in `historical_blocks.rs` or the calling module)
- [x] Consistency check for initial block + state
- [x] Fetch the initial state and block from a beacon node HTTP endpoint
- [x] Don't crash fetching beacon states by slot from the API
- [x] Background service for state reconstruction, triggered by CLI flag or API call.

Considered out of scope for this PR:

- Drop the requirement to provide the `--checkpoint-block` (this would require some pretty heavy refactoring of block verification)


Co-authored-by: Diva M <divma@protonmail.com>
This commit is contained in:
Michael Sproul
2021-09-22 00:37:28 +00:00
parent 280e4fe23d
commit 9667dc2f03
71 changed files with 4012 additions and 459 deletions

View File

@@ -522,7 +522,7 @@ impl<TSpec: EthSpec> Behaviour<TSpec> {
}
/// Inform the peer that their request produced an error.
pub fn _send_error_reponse(
pub fn send_error_reponse(
&mut self,
peer_id: PeerId,
id: PeerRequestId,

View File

@@ -578,8 +578,17 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
RPCResponseErrorCode::Unknown => PeerAction::HighToleranceError,
RPCResponseErrorCode::ResourceUnavailable => {
// NOTE: This error only makes sense for the `BlocksByRange` and `BlocksByRoot`
// protocols. For the time being, there is no reason why a peer should send
// this error.
// protocols.
//
// If we are syncing, there is no point keeping these peers around and
// continually failing to request blocks. We instantly ban them and hope that
// by the time the ban lifts, the peers will have completed their backfill
// sync.
//
// TODO: Potentially a more graceful way of handling such peers would be to
// implement a new sync type which tracks these peers and prevents the sync
// algorithms from requesting blocks from them (at least for a set period of
// time, multiple failures would then lead to a ban).
PeerAction::Fatal
}
RPCResponseErrorCode::ServerError => PeerAction::MidToleranceError,

View File

@@ -211,16 +211,13 @@ mod tests {
let _ = snappy_buf.split_to(1);
// decode message just as snappy message
let snappy_decoded_message = snappy_outbound_codec.decode(&mut snappy_buf).unwrap();
let _snappy_decoded_message = snappy_outbound_codec.decode(&mut snappy_buf).unwrap();
// build codecs for entire chunk
let mut snappy_base_outbound_codec = BaseOutboundCodec::new(snappy_outbound_codec);
// decode message as ssz snappy chunk
let snappy_decoded_chunk = snappy_base_outbound_codec.decode(&mut buf).unwrap();
dbg!(snappy_decoded_message);
dbg!(snappy_decoded_chunk);
let _snappy_decoded_chunk = snappy_base_outbound_codec.decode(&mut buf).unwrap();
}
#[test]

View File

@@ -275,7 +275,7 @@ impl<TSpec: EthSpec> Service<TSpec> {
) {
self.swarm
.behaviour_mut()
._send_error_reponse(peer_id, id, error, reason);
.send_error_reponse(peer_id, id, error, reason);
}
/// Report a peer's action.

View File

@@ -1,7 +1,7 @@
//! A collection of variables that are accessible outside of the network thread itself.
use crate::peer_manager::PeerDB;
use crate::rpc::MetaData;
use crate::types::SyncState;
use crate::types::{BackFillState, SyncState};
use crate::Client;
use crate::EnrExt;
use crate::{Enr, GossipTopic, Multiaddr, PeerId};
@@ -29,6 +29,8 @@ pub struct NetworkGlobals<TSpec: EthSpec> {
pub gossipsub_subscriptions: RwLock<HashSet<GossipTopic>>,
/// The current sync status of the node.
pub sync_state: RwLock<SyncState>,
/// The current state of the backfill sync.
pub backfill_state: RwLock<BackFillState>,
}
impl<TSpec: EthSpec> NetworkGlobals<TSpec> {
@@ -50,6 +52,7 @@ impl<TSpec: EthSpec> NetworkGlobals<TSpec> {
peers: RwLock::new(PeerDB::new(trusted_peers, log)),
gossipsub_subscriptions: RwLock::new(HashSet::new()),
sync_state: RwLock::new(SyncState::Stalled),
backfill_state: RwLock::new(BackFillState::NotRequired),
}
}
@@ -104,6 +107,11 @@ impl<TSpec: EthSpec> NetworkGlobals<TSpec> {
self.sync_state.read().clone()
}
/// Returns the current backfill state.
pub fn backfill_state(&self) -> BackFillState {
self.backfill_state.read().clone()
}
/// Returns a `Client` type if one is known for the `PeerId`.
pub fn client(&self, peer_id: &PeerId) -> Client {
self.peers

View File

@@ -15,5 +15,5 @@ pub type Enr = discv5::enr::Enr<discv5::enr::CombinedKey>;
pub use globals::NetworkGlobals;
pub use pubsub::{PubsubMessage, SnappyTransform};
pub use subnet::{Subnet, SubnetDiscovery};
pub use sync_state::SyncState;
pub use sync_state::{BackFillState, SyncState};
pub use topics::{subnet_from_topic_hash, GossipEncoding, GossipKind, GossipTopic, CORE_TOPICS};

View File

@@ -10,8 +10,13 @@ pub enum SyncState {
/// The node is performing a long-range (batch) sync over one or many head chains.
/// In this state parent lookups are disabled.
SyncingHead { start_slot: Slot, target_slot: Slot },
/// The node has identified the need for is sync operations and is transitioning to a syncing
/// state.
/// The node is undertaking a backfill sync. This occurs when a user has specified a trusted
/// state. The node first syncs "forward" by downloading blocks up to the current head as
/// specified by its peers. Once completed, the node enters this sync state and attempts to
/// download all required historical blocks to complete its chain.
BackFillSyncing { completed: usize, remaining: usize },
/// The node has completed syncing a finalized chain and is in the process of re-evaluating
/// which sync state to progress to.
SyncTransition,
/// The node is up to date with all known peers and is connected to at least one
/// fully synced peer. In this state, parent lookups are enabled.
@@ -21,6 +26,21 @@ pub enum SyncState {
Stalled,
}
#[derive(PartialEq, Debug, Clone, Serialize, Deserialize)]
/// The state of the backfill sync.
pub enum BackFillState {
/// The sync is partially completed and currently paused.
Paused,
/// We are currently backfilling.
Syncing,
/// A backfill sync has completed.
Completed,
/// A backfill sync is not required.
NotRequired,
/// Too many failed attempts at backfilling. Consider it failed.
Failed,
}
impl PartialEq for SyncState {
fn eq(&self, other: &Self) -> bool {
matches!(
@@ -32,6 +52,10 @@ impl PartialEq for SyncState {
| (SyncState::Synced, SyncState::Synced)
| (SyncState::Stalled, SyncState::Stalled)
| (SyncState::SyncTransition, SyncState::SyncTransition)
| (
SyncState::BackFillSyncing { .. },
SyncState::BackFillSyncing { .. }
)
)
}
}
@@ -43,14 +67,18 @@ impl SyncState {
SyncState::SyncingFinalized { .. } => true,
SyncState::SyncingHead { .. } => true,
SyncState::SyncTransition => true,
// Backfill doesn't affect any logic; we do not consider this state as syncing.
SyncState::BackFillSyncing { .. } => false,
SyncState::Synced => false,
SyncState::Stalled => false,
}
}
/// Returns true if the node is synced.
///
/// NOTE: We consider the node synced if it is fetching old historical blocks.
pub fn is_synced(&self) -> bool {
matches!(self, SyncState::Synced)
matches!(self, SyncState::Synced | SyncState::BackFillSyncing { .. })
}
}
@@ -61,7 +89,8 @@ impl std::fmt::Display for SyncState {
SyncState::SyncingHead { .. } => write!(f, "Syncing Head Chain"),
SyncState::Synced { .. } => write!(f, "Synced"),
SyncState::Stalled { .. } => write!(f, "Stalled"),
SyncState::SyncTransition => write!(f, "Searching syncing peers"),
SyncState::SyncTransition => write!(f, "Evaluating known peers"),
SyncState::BackFillSyncing { .. } => write!(f, "Syncing Historical Blocks"),
}
}
}