mirror of
https://github.com/sigp/lighthouse.git
synced 2026-07-04 05:14:33 +00:00
Implement backfill sync restart
This commit is contained in:
@@ -156,6 +156,10 @@ impl<E: EthSpec> NetworkGlobals<E> {
|
|||||||
.add_latest_update((update_start_slot, cgc))
|
.add_latest_update((update_start_slot, cgc))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn prune_cgc_updates_older_than(&self, slot: Slot) {
|
||||||
|
self.cgc_updates.write().prune_updates_older_than(slot);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn dump_cgc_updates(&self) -> CGCUpdates {
|
pub fn dump_cgc_updates(&self) -> CGCUpdates {
|
||||||
self.cgc_updates.read().clone()
|
self.cgc_updates.read().clone()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
|||||||
use tokio::sync::mpsc;
|
use tokio::sync::mpsc;
|
||||||
use tokio_stream::wrappers::UnboundedReceiverStream;
|
use tokio_stream::wrappers::UnboundedReceiverStream;
|
||||||
use tracing::{debug, error, info_span, trace, warn, Instrument};
|
use tracing::{debug, error, info_span, trace, warn, Instrument};
|
||||||
use types::{BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, SignedBeaconBlock};
|
use types::{BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, SignedBeaconBlock, Slot};
|
||||||
|
|
||||||
/// Handles messages from the network and routes them to the appropriate service to be handled.
|
/// Handles messages from the network and routes them to the appropriate service to be handled.
|
||||||
pub struct Router<T: BeaconChainTypes> {
|
pub struct Router<T: BeaconChainTypes> {
|
||||||
@@ -75,6 +75,8 @@ pub enum RouterMessage<E: EthSpec> {
|
|||||||
PubsubMessage(MessageId, PeerId, PubsubMessage<E>, bool),
|
PubsubMessage(MessageId, PeerId, PubsubMessage<E>, bool),
|
||||||
/// The peer manager has requested we re-status a peer.
|
/// The peer manager has requested we re-status a peer.
|
||||||
StatusPeer(PeerId),
|
StatusPeer(PeerId),
|
||||||
|
/// Trigger backfill sync restart
|
||||||
|
BackfillSyncRestart(Slot),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: BeaconChainTypes> Router<T> {
|
impl<T: BeaconChainTypes> Router<T> {
|
||||||
@@ -181,6 +183,9 @@ impl<T: BeaconChainTypes> Router<T> {
|
|||||||
RouterMessage::PubsubMessage(id, peer_id, gossip, should_process) => {
|
RouterMessage::PubsubMessage(id, peer_id, gossip, should_process) => {
|
||||||
self.handle_gossip(id, peer_id, gossip, should_process);
|
self.handle_gossip(id, peer_id, gossip, should_process);
|
||||||
}
|
}
|
||||||
|
RouterMessage::BackfillSyncRestart(slot) => {
|
||||||
|
self.send_to_sync(SyncMessage::BackfillSyncRestart(slot));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ use tokio::sync::mpsc;
|
|||||||
use tokio::time::Sleep;
|
use tokio::time::Sleep;
|
||||||
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
|
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
|
||||||
use types::{
|
use types::{
|
||||||
ChainSpec, Epoch, EthSpec, ForkContext, Slot, SubnetId, SyncCommitteeSubscription,
|
ChainSpec, Epoch, EthSpec, ForkContext, ForkName, Slot, SubnetId, SyncCommitteeSubscription,
|
||||||
SyncSubnetId, Unsigned, ValidatorSubscription,
|
SyncSubnetId, Unsigned, ValidatorSubscription,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -850,6 +850,14 @@ impl<T: BeaconChainTypes> NetworkService<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn on_cgc_update_interval(&mut self) {
|
fn on_cgc_update_interval(&mut self) {
|
||||||
|
// Skip running this function if Fulu is not scheduled. But run it before the fork to start
|
||||||
|
// announcing the CGC ahead of the fork.
|
||||||
|
let fulu_fork_epoch = match self.beacon_chain.spec.fork_epoch(ForkName::Fulu) {
|
||||||
|
None => return,
|
||||||
|
Some(epoch) if epoch == Epoch::max_value() => return,
|
||||||
|
Some(epoch) => epoch,
|
||||||
|
};
|
||||||
|
|
||||||
let prev_cgc = self.network_globals.custody_group_count(Slot::max_value());
|
let prev_cgc = self.network_globals.custody_group_count(Slot::max_value());
|
||||||
let Ok(clock_epoch) = self.beacon_chain.epoch() else {
|
let Ok(clock_epoch) = self.beacon_chain.epoch() else {
|
||||||
return;
|
return;
|
||||||
@@ -973,33 +981,47 @@ impl<T: BeaconChainTypes> NetworkService<T> {
|
|||||||
// with CGC 128
|
// with CGC 128
|
||||||
//
|
//
|
||||||
let oldest_block_slot = self.beacon_chain.store.get_anchor_info().oldest_block_slot;
|
let oldest_block_slot = self.beacon_chain.store.get_anchor_info().oldest_block_slot;
|
||||||
|
// TODO(das): use min_epochs_for_data_columns
|
||||||
|
let last_pruned_epoch = clock_epoch.saturating_sub(Epoch::new(
|
||||||
|
self.beacon_chain.spec.min_epochs_for_blob_sidecars_requests,
|
||||||
|
));
|
||||||
|
let last_pruned_slot = last_pruned_epoch.start_slot(T::EthSpec::slots_per_epoch());
|
||||||
|
let fulu_fork_slot = fulu_fork_epoch.start_slot(T::EthSpec::slots_per_epoch());
|
||||||
|
let oldest_relevant_slot = std::cmp::max(
|
||||||
|
oldest_block_slot,
|
||||||
|
std::cmp::max(last_pruned_slot, fulu_fork_slot),
|
||||||
|
);
|
||||||
|
|
||||||
let finalized_slot = self.beacon_chain.finalized_slot();
|
let finalized_slot = self.beacon_chain.finalized_slot();
|
||||||
let cgc_at_oldest_block_slot = self.network_globals.custody_group_count(oldest_block_slot);
|
|
||||||
|
let cgc_at_oldest_relevant_slot = self
|
||||||
|
.network_globals
|
||||||
|
.custody_group_count(oldest_relevant_slot);
|
||||||
let cgc_at_finalized_slot = self.network_globals.custody_group_count(finalized_slot);
|
let cgc_at_finalized_slot = self.network_globals.custody_group_count(finalized_slot);
|
||||||
|
|
||||||
|
let backfill_started_recently =
|
||||||
|
finalized_slot.saturating_sub(oldest_block_slot) < MAX_SLOT_DISTANCE_BACKFILL_RESTART;
|
||||||
|
let backfill_finished = oldest_block_slot == Slot::new(0);
|
||||||
|
|
||||||
// TODO(das): If we support a decreasing CGC we must consider the min value between this two
|
// TODO(das): If we support a decreasing CGC we must consider the min value between this two
|
||||||
// slots.
|
// slots.
|
||||||
if cgc_at_oldest_block_slot < cgc_at_finalized_slot {
|
//
|
||||||
let backfill_started_recently = finalized_slot.saturating_sub(oldest_block_slot)
|
// Skip if backfill has finished. State reconstruction may have already started and we could
|
||||||
< MAX_SLOT_DISTANCE_BACKFILL_RESTART;
|
// mess with the DB. For real networks Fulu fork is way ahead of genesis so it won't affect
|
||||||
// Note: we don't check if backfill finished. If it did because we are close to genesis,
|
if cgc_at_oldest_relevant_slot < cgc_at_finalized_slot
|
||||||
// we want to restart it anyway to backfill with the CGC. The only condition to NOT
|
&& backfill_started_recently
|
||||||
// restart is if backfill went too far and thus we would waste too much bandwidth
|
&& !backfill_finished
|
||||||
// fetching the blocks again.
|
{
|
||||||
if backfill_started_recently {
|
// We need backfill sync to fetch batches with `CGC_f = cgc_at_finalized_slot`. Then
|
||||||
// We need backfill sync to fetch batches with `CGC_f = cgc_at_finalized_slot`. Then
|
// `custody_group_count(oldest_block_slot) should now return `CGC_f`. So we have to
|
||||||
// `custody_group_count(oldest_block_slot) should now return `CGC_f`. So we have to
|
// delete the CGC updates with `update.slot < finalized_slot`
|
||||||
// delete the CGC updates with `update.slot < finalized_slot`
|
self.network_globals
|
||||||
todo!();
|
.prune_cgc_updates_older_than(finalized_slot);
|
||||||
}
|
self.send_to_router(RouterMessage::BackfillSyncRestart(finalized_slot));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Schedule an advertise CGC update for later
|
// Schedule an advertise CGC update for later
|
||||||
// TODO(das): use min_epochs_for_data_columns
|
let cgc_to_announce = cgc_at_oldest_relevant_slot;
|
||||||
let last_pruned_epoch =
|
|
||||||
clock_epoch - Epoch::new(self.beacon_chain.spec.min_epochs_for_blob_sidecars_requests);
|
|
||||||
let cgc_to_announce = self
|
|
||||||
.network_globals
|
|
||||||
.custody_group_count(last_pruned_epoch.start_slot(T::EthSpec::slots_per_epoch()));
|
|
||||||
|
|
||||||
// update_enr_cgc updates the NetworkGlobals ENR
|
// update_enr_cgc updates the NetworkGlobals ENR
|
||||||
match self.libp2p.update_enr_cgc(cgc_to_announce) {
|
match self.libp2p.update_enr_cgc(cgc_to_announce) {
|
||||||
|
|||||||
@@ -208,6 +208,37 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Restarts backfill backfill sync clearing its state
|
||||||
|
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||||
|
#[instrument(parent = None,
|
||||||
|
level = "info",
|
||||||
|
fields(service = "backfill_sync"),
|
||||||
|
name = "backfill_sync",
|
||||||
|
skip_all
|
||||||
|
)]
|
||||||
|
pub fn restart(
|
||||||
|
&mut self,
|
||||||
|
network: &mut SyncNetworkContext<T>,
|
||||||
|
) -> Result<SyncStart, BackFillError> {
|
||||||
|
match self.state() {
|
||||||
|
// Reset and start again
|
||||||
|
BackFillState::Syncing => {
|
||||||
|
self.reset_sync();
|
||||||
|
self.set_state(BackFillState::Paused);
|
||||||
|
self.start(network)
|
||||||
|
}
|
||||||
|
// Reset, but keep paused
|
||||||
|
BackFillState::Paused => {
|
||||||
|
self.reset_sync();
|
||||||
|
Ok(SyncStart::NotSyncing)
|
||||||
|
}
|
||||||
|
// Ignore a restart if completed
|
||||||
|
BackFillState::Completed => Ok(SyncStart::NotSyncing),
|
||||||
|
// Already reset, no need to do anything
|
||||||
|
BackFillState::Failed => Ok(SyncStart::NotSyncing),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Starts or resumes syncing.
|
/// Starts or resumes syncing.
|
||||||
///
|
///
|
||||||
/// If resuming is successful, reports back the current syncing metrics.
|
/// If resuming is successful, reports back the current syncing metrics.
|
||||||
@@ -486,6 +517,24 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
|||||||
|
|
||||||
// Set the state
|
// Set the state
|
||||||
self.set_state(BackFillState::Failed);
|
self.set_state(BackFillState::Failed);
|
||||||
|
self.reset_sync();
|
||||||
|
|
||||||
|
// Emit the log here
|
||||||
|
error!(?error, "Backfill sync failed");
|
||||||
|
|
||||||
|
// Return the error, kinda weird pattern, but I want to use
|
||||||
|
// `self.fail_chain(_)?` in other parts of the code.
|
||||||
|
Err(error)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This resets past variables, to allow for a fresh start when resuming.
|
||||||
|
#[instrument(parent = None,
|
||||||
|
level = "info",
|
||||||
|
fields(service = "backfill_sync"),
|
||||||
|
name = "backfill_sync",
|
||||||
|
skip_all
|
||||||
|
)]
|
||||||
|
fn reset_sync(&mut self) {
|
||||||
// Remove all batches and active requests and participating peers.
|
// Remove all batches and active requests and participating peers.
|
||||||
self.batches.clear();
|
self.batches.clear();
|
||||||
self.active_requests.clear();
|
self.active_requests.clear();
|
||||||
@@ -499,13 +548,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
|||||||
self.current_processing_batch = None;
|
self.current_processing_batch = None;
|
||||||
|
|
||||||
// NOTE: Lets keep validated_batches for posterity
|
// NOTE: Lets keep validated_batches for posterity
|
||||||
|
|
||||||
// Emit the log here
|
|
||||||
error!(?error, "Backfill sync failed");
|
|
||||||
|
|
||||||
// Return the error, kinda weird pattern, but I want to use
|
|
||||||
// `self.fail_chain(_)?` in other parts of the code.
|
|
||||||
Err(error)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Processes the batch with the given id.
|
/// Processes the batch with the given id.
|
||||||
|
|||||||
@@ -177,6 +177,9 @@ pub enum SyncMessage<E: EthSpec> {
|
|||||||
|
|
||||||
/// A block from gossip has completed processing,
|
/// A block from gossip has completed processing,
|
||||||
GossipBlockProcessResult { block_root: Hash256, imported: bool },
|
GossipBlockProcessResult { block_root: Hash256, imported: bool },
|
||||||
|
|
||||||
|
/// Network service asks backfill sync to restart after increasing the oldest_block_slot
|
||||||
|
BackfillSyncRestart(Slot),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The type of processing specified for a received block.
|
/// The type of processing specified for a received block.
|
||||||
@@ -896,6 +899,13 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
|||||||
self.on_sampling_result(requester, result)
|
self.on_sampling_result(requester, result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
SyncMessage::BackfillSyncRestart(slot) => {
|
||||||
|
if let Err(e) = self.backfill_sync.restart(&mut self.network) {
|
||||||
|
error!(error = ?e, "Error on backfill sync restart");
|
||||||
|
} else {
|
||||||
|
debug!(%slot, "Received backfill sync restart event");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -34,6 +34,10 @@ impl CGCUpdates {
|
|||||||
.map_err(|e| format!("Updates list full: {e:?}"))
|
.map_err(|e| format!("Updates list full: {e:?}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn prune_updates_older_than(&mut self, slot: Slot) {
|
||||||
|
todo!("{slot}");
|
||||||
|
}
|
||||||
|
|
||||||
pub fn iter(&self) -> impl Iterator<Item = (Slot, u64)> + '_ {
|
pub fn iter(&self) -> impl Iterator<Item = (Slot, u64)> + '_ {
|
||||||
std::iter::once((Slot::new(0), self.initial_value)).chain(self.updates.iter().copied())
|
std::iter::once((Slot::new(0), self.initial_value)).chain(self.updates.iter().copied())
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user