mirror of
https://github.com/sigp/lighthouse.git
synced 2026-05-31 13:17:09 +00:00
Fix bug in database pruning (#1564)
## Issue Addressed Closes #1488 ## Proposed Changes * Prevent the pruning algorithm from over-eagerly deleting states at skipped slots when they are shared with the canonical chain. * Add `debug` logging to the pruning algorithm so we have so better chance of debugging future issues from logs. * Modify the handling of the "finalized state" in the beacon chain, so that it's always the state at the first slot of the finalized epoch (previously it was the state at the finalized block). This gives database pruning a clearer and cleaner view of things, and will marginally impact the pruning of the op pool, observed proposers, etc (in ways that are safe as far as I can tell). * Remove duplicated `RevertedFinalizedEpoch` check from `after_finalization` * Delete useless and unused `max_finality_distance` * Add tests that exercise pruning with shared states at skip slots * Delete unnecessary `block_strategy` argument from `add_blocks` and friends in the test harness (will likely conflict with #1380 slightly, sorry @adaszko -- but we can fix that) * Bonus: add a `BeaconChain::with_head` method. I didn't end up needing it, but it turned out quite nice, so I figured we could keep it? ## Additional Info Any users who have experienced pruning errors on Medalla will need to resync after upgrading to a release including this change. This should end unbounded `chain_db` growth! 🎉
This commit is contained in:
@@ -1,18 +1,34 @@
|
||||
use crate::errors::BeaconChainError;
|
||||
use crate::head_tracker::HeadTracker;
|
||||
use parking_lot::Mutex;
|
||||
use slog::{debug, warn, Logger};
|
||||
use slog::{debug, error, warn, Logger};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::mem;
|
||||
use std::sync::mpsc;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use store::hot_cold_store::{process_finalization, HotColdDBError};
|
||||
use store::iter::{ParentRootBlockIterator, RootsIterator};
|
||||
use store::iter::RootsIterator;
|
||||
use store::{Error, ItemStore, StoreOp};
|
||||
pub use store::{HotColdDB, MemoryStore};
|
||||
use types::*;
|
||||
use types::{BeaconState, EthSpec, Hash256, Slot};
|
||||
use types::{
|
||||
BeaconState, BeaconStateError, BeaconStateHash, Checkpoint, EthSpec, Hash256,
|
||||
SignedBeaconBlockHash, Slot,
|
||||
};
|
||||
|
||||
/// Logic errors that can occur during pruning, none of these should ever happen.
|
||||
#[derive(Debug)]
|
||||
pub enum PruningError {
|
||||
IncorrectFinalizedState {
|
||||
state_slot: Slot,
|
||||
new_finalized_slot: Slot,
|
||||
},
|
||||
MissingInfoForCanonicalChain {
|
||||
slot: Slot,
|
||||
},
|
||||
UnexpectedEqualStateRoots,
|
||||
UnexpectedUnequalStateRoots,
|
||||
}
|
||||
|
||||
/// Trait for migration processes that update the database upon finalization.
|
||||
pub trait Migrate<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>>:
|
||||
@@ -22,17 +38,16 @@ pub trait Migrate<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>>:
|
||||
|
||||
fn process_finalization(
|
||||
&self,
|
||||
_state_root: Hash256,
|
||||
_finalized_state_root: BeaconStateHash,
|
||||
_new_finalized_state: BeaconState<E>,
|
||||
_max_finality_distance: u64,
|
||||
_head_tracker: Arc<HeadTracker>,
|
||||
_old_finalized_block_hash: SignedBeaconBlockHash,
|
||||
_new_finalized_block_hash: SignedBeaconBlockHash,
|
||||
_old_finalized_checkpoint: Checkpoint,
|
||||
_new_finalized_checkpoint: Checkpoint,
|
||||
) {
|
||||
}
|
||||
|
||||
/// Traverses live heads and prunes blocks and states of chains that we know can't be built
|
||||
/// upon because finalization would prohibit it. This is an optimisation intended to save disk
|
||||
/// upon because finalization would prohibit it. This is an optimisation intended to save disk
|
||||
/// space.
|
||||
///
|
||||
/// Assumptions:
|
||||
@@ -40,37 +55,63 @@ pub trait Migrate<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>>:
|
||||
fn prune_abandoned_forks(
|
||||
store: Arc<HotColdDB<E, Hot, Cold>>,
|
||||
head_tracker: Arc<HeadTracker>,
|
||||
old_finalized_block_hash: SignedBeaconBlockHash,
|
||||
new_finalized_block_hash: SignedBeaconBlockHash,
|
||||
new_finalized_slot: Slot,
|
||||
new_finalized_state_hash: BeaconStateHash,
|
||||
new_finalized_state: &BeaconState<E>,
|
||||
old_finalized_checkpoint: Checkpoint,
|
||||
new_finalized_checkpoint: Checkpoint,
|
||||
log: &Logger,
|
||||
) -> Result<(), BeaconChainError> {
|
||||
// There will never be any blocks to prune if there is only a single head in the chain.
|
||||
if head_tracker.heads().len() == 1 {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let old_finalized_slot = store
|
||||
.get_block(&old_finalized_block_hash.into())?
|
||||
.ok_or_else(|| BeaconChainError::MissingBeaconBlock(old_finalized_block_hash.into()))?
|
||||
.slot();
|
||||
let old_finalized_slot = old_finalized_checkpoint
|
||||
.epoch
|
||||
.start_slot(E::slots_per_epoch());
|
||||
let new_finalized_slot = new_finalized_checkpoint
|
||||
.epoch
|
||||
.start_slot(E::slots_per_epoch());
|
||||
let new_finalized_block_hash = new_finalized_checkpoint.root.into();
|
||||
|
||||
// Collect hashes from new_finalized_block back to old_finalized_block (inclusive)
|
||||
let mut found_block = false; // hack for `take_until`
|
||||
let newly_finalized_blocks: HashMap<SignedBeaconBlockHash, Slot> =
|
||||
ParentRootBlockIterator::new(&*store, new_finalized_block_hash.into())
|
||||
.take_while(|result| match result {
|
||||
Ok((block_hash, _)) => {
|
||||
if found_block {
|
||||
false
|
||||
} else {
|
||||
found_block |= *block_hash == old_finalized_block_hash.into();
|
||||
true
|
||||
}
|
||||
}
|
||||
Err(_) => true,
|
||||
})
|
||||
.map(|result| result.map(|(block_hash, block)| (block_hash.into(), block.slot())))
|
||||
.collect::<Result<_, _>>()?;
|
||||
// The finalized state must be for the epoch boundary slot, not the slot of the finalized
|
||||
// block.
|
||||
if new_finalized_state.slot != new_finalized_slot {
|
||||
return Err(PruningError::IncorrectFinalizedState {
|
||||
state_slot: new_finalized_state.slot,
|
||||
new_finalized_slot,
|
||||
}
|
||||
.into());
|
||||
}
|
||||
|
||||
debug!(
|
||||
log,
|
||||
"Starting database pruning";
|
||||
"old_finalized_epoch" => old_finalized_checkpoint.epoch,
|
||||
"old_finalized_root" => format!("{:?}", old_finalized_checkpoint.root),
|
||||
"new_finalized_epoch" => new_finalized_checkpoint.epoch,
|
||||
"new_finalized_root" => format!("{:?}", new_finalized_checkpoint.root),
|
||||
);
|
||||
|
||||
// For each slot between the new finalized checkpoint and the old finalized checkpoint,
|
||||
// collect the beacon block root and state root of the canonical chain.
|
||||
let newly_finalized_chain: HashMap<Slot, (SignedBeaconBlockHash, BeaconStateHash)> =
|
||||
std::iter::once(Ok((
|
||||
new_finalized_slot,
|
||||
(new_finalized_block_hash, new_finalized_state_hash),
|
||||
)))
|
||||
.chain(
|
||||
RootsIterator::new(store.clone(), new_finalized_state).map(|res| {
|
||||
res.map(|(block_root, state_root, slot)| {
|
||||
(slot, (block_root.into(), state_root.into()))
|
||||
})
|
||||
}),
|
||||
)
|
||||
.take_while(|res| {
|
||||
res.as_ref()
|
||||
.map_or(true, |(slot, _)| *slot >= old_finalized_slot)
|
||||
})
|
||||
.collect::<Result<_, _>>()?;
|
||||
|
||||
// We don't know which blocks are shared among abandoned chains, so we buffer and delete
|
||||
// everything in one fell swoop.
|
||||
@@ -79,75 +120,110 @@ pub trait Migrate<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>>:
|
||||
let mut abandoned_heads: HashSet<Hash256> = HashSet::new();
|
||||
|
||||
for (head_hash, head_slot) in head_tracker.heads() {
|
||||
let mut potentially_abandoned_head: Option<Hash256> = Some(head_hash);
|
||||
let mut potentially_abandoned_blocks: Vec<(
|
||||
Slot,
|
||||
Option<SignedBeaconBlockHash>,
|
||||
Option<BeaconStateHash>,
|
||||
)> = Vec::new();
|
||||
let mut potentially_abandoned_head = Some(head_hash);
|
||||
let mut potentially_abandoned_blocks = vec![];
|
||||
|
||||
let head_state_hash = store
|
||||
.get_block(&head_hash)?
|
||||
.ok_or_else(|| BeaconStateError::MissingBeaconBlock(head_hash.into()))?
|
||||
.state_root();
|
||||
|
||||
// Iterate backwards from this head, staging blocks and states for deletion.
|
||||
let iter = std::iter::once(Ok((head_hash, head_state_hash, head_slot)))
|
||||
.chain(RootsIterator::from_block(Arc::clone(&store), head_hash)?);
|
||||
.chain(RootsIterator::from_block(store.clone(), head_hash)?);
|
||||
|
||||
for maybe_tuple in iter {
|
||||
let (block_hash, state_hash, slot) = maybe_tuple?;
|
||||
if slot < old_finalized_slot {
|
||||
// We must assume here any candidate chains include old_finalized_block_hash,
|
||||
// i.e. there aren't any forks starting at a block that is a strict ancestor of
|
||||
// old_finalized_block_hash.
|
||||
break;
|
||||
}
|
||||
match newly_finalized_blocks.get(&block_hash.into()).copied() {
|
||||
// Block is not finalized, mark it and its state for deletion
|
||||
let (block_root, state_root, slot) = maybe_tuple?;
|
||||
let block_root = SignedBeaconBlockHash::from(block_root);
|
||||
let state_root = BeaconStateHash::from(state_root);
|
||||
|
||||
match newly_finalized_chain.get(&slot) {
|
||||
// If there's no information about a slot on the finalized chain, then
|
||||
// it should be because it's ahead of the new finalized slot. Stage
|
||||
// the fork's block and state for possible deletion.
|
||||
None => {
|
||||
potentially_abandoned_blocks.push((
|
||||
slot,
|
||||
Some(block_hash.into()),
|
||||
Some(state_hash.into()),
|
||||
));
|
||||
if slot > new_finalized_slot {
|
||||
potentially_abandoned_blocks.push((
|
||||
slot,
|
||||
Some(block_root),
|
||||
Some(state_root),
|
||||
));
|
||||
} else if slot >= old_finalized_slot {
|
||||
return Err(PruningError::MissingInfoForCanonicalChain { slot }.into());
|
||||
} else {
|
||||
// We must assume here any candidate chains include the old finalized
|
||||
// checkpoint, i.e. there aren't any forks starting at a block that is a
|
||||
// strict ancestor of old_finalized_checkpoint.
|
||||
warn!(
|
||||
log,
|
||||
"Found a chain that should already have been pruned";
|
||||
"head_block_root" => format!("{:?}", head_hash),
|
||||
"head_slot" => head_slot,
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
Some(finalized_slot) => {
|
||||
// Block root is finalized, and we have reached the slot it was finalized
|
||||
// at: we've hit a shared part of the chain.
|
||||
if finalized_slot == slot {
|
||||
// The first finalized block of a candidate chain lies after (in terms
|
||||
// of slots order) the newly finalized block. It's not a candidate for
|
||||
// prunning.
|
||||
if finalized_slot == new_finalized_slot {
|
||||
Some((finalized_block_root, finalized_state_root)) => {
|
||||
// This fork descends from a newly finalized block, we can stop.
|
||||
if block_root == *finalized_block_root {
|
||||
// Sanity check: if the slot and block root match, then the
|
||||
// state roots should match too.
|
||||
if state_root != *finalized_state_root {
|
||||
return Err(PruningError::UnexpectedUnequalStateRoots.into());
|
||||
}
|
||||
|
||||
// If the fork descends from the whole finalized chain,
|
||||
// do not prune it. Otherwise continue to delete all
|
||||
// of the blocks and states that have been staged for
|
||||
// deletion so far.
|
||||
if slot == new_finalized_slot {
|
||||
potentially_abandoned_blocks.clear();
|
||||
potentially_abandoned_head.take();
|
||||
}
|
||||
|
||||
// If there are skipped slots on the fork to be pruned, then
|
||||
// we will have just staged the common block for deletion.
|
||||
// Unstage it.
|
||||
else {
|
||||
for (_, block_root, _) in
|
||||
potentially_abandoned_blocks.iter_mut().rev()
|
||||
{
|
||||
if block_root.as_ref() == Some(finalized_block_root) {
|
||||
*block_root = None;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
// Block root is finalized, but we're at a skip slot: delete the state only.
|
||||
else {
|
||||
} else {
|
||||
if state_root == *finalized_state_root {
|
||||
return Err(PruningError::UnexpectedEqualStateRoots.into());
|
||||
}
|
||||
potentially_abandoned_blocks.push((
|
||||
slot,
|
||||
None,
|
||||
Some(state_hash.into()),
|
||||
Some(block_root),
|
||||
Some(state_root),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
abandoned_heads.extend(potentially_abandoned_head.into_iter());
|
||||
if !potentially_abandoned_blocks.is_empty() {
|
||||
if let Some(abandoned_head) = potentially_abandoned_head {
|
||||
debug!(
|
||||
log,
|
||||
"Pruning head";
|
||||
"head_block_root" => format!("{:?}", abandoned_head),
|
||||
"head_slot" => head_slot,
|
||||
);
|
||||
abandoned_heads.insert(abandoned_head);
|
||||
abandoned_blocks.extend(
|
||||
potentially_abandoned_blocks
|
||||
.iter()
|
||||
.filter_map(|(_, maybe_block_hash, _)| *maybe_block_hash),
|
||||
);
|
||||
abandoned_states.extend(potentially_abandoned_blocks.iter().filter_map(
|
||||
|(slot, _, maybe_state_hash)| match maybe_state_hash {
|
||||
None => None,
|
||||
Some(state_hash) => Some((*slot, *state_hash)),
|
||||
},
|
||||
|(slot, _, maybe_state_hash)| maybe_state_hash.map(|sr| (*slot, sr)),
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -166,6 +242,8 @@ pub trait Migrate<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>>:
|
||||
head_tracker.remove_head(head_hash);
|
||||
}
|
||||
|
||||
debug!(log, "Database pruning complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -184,48 +262,52 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> Migrate<E, Hot, Cold> fo
|
||||
/// Mostly useful for tests.
|
||||
pub struct BlockingMigrator<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> {
|
||||
db: Arc<HotColdDB<E, Hot, Cold>>,
|
||||
log: Logger,
|
||||
}
|
||||
|
||||
impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> Migrate<E, Hot, Cold>
|
||||
for BlockingMigrator<E, Hot, Cold>
|
||||
{
|
||||
fn new(db: Arc<HotColdDB<E, Hot, Cold>>, _: Logger) -> Self {
|
||||
BlockingMigrator { db }
|
||||
fn new(db: Arc<HotColdDB<E, Hot, Cold>>, log: Logger) -> Self {
|
||||
BlockingMigrator { db, log }
|
||||
}
|
||||
|
||||
fn process_finalization(
|
||||
&self,
|
||||
state_root: Hash256,
|
||||
finalized_state_root: BeaconStateHash,
|
||||
new_finalized_state: BeaconState<E>,
|
||||
_max_finality_distance: u64,
|
||||
head_tracker: Arc<HeadTracker>,
|
||||
old_finalized_block_hash: SignedBeaconBlockHash,
|
||||
new_finalized_block_hash: SignedBeaconBlockHash,
|
||||
old_finalized_checkpoint: Checkpoint,
|
||||
new_finalized_checkpoint: Checkpoint,
|
||||
) {
|
||||
if let Err(e) = Self::prune_abandoned_forks(
|
||||
self.db.clone(),
|
||||
head_tracker,
|
||||
old_finalized_block_hash,
|
||||
new_finalized_block_hash,
|
||||
new_finalized_state.slot,
|
||||
finalized_state_root,
|
||||
&new_finalized_state,
|
||||
old_finalized_checkpoint,
|
||||
new_finalized_checkpoint,
|
||||
&self.log,
|
||||
) {
|
||||
eprintln!("Pruning error: {:?}", e);
|
||||
error!(&self.log, "Pruning error"; "error" => format!("{:?}", e));
|
||||
}
|
||||
|
||||
if let Err(e) = process_finalization(self.db.clone(), state_root, &new_finalized_state) {
|
||||
// This migrator is only used for testing, so we just log to stderr without a logger.
|
||||
eprintln!("Migration error: {:?}", e);
|
||||
if let Err(e) = process_finalization(
|
||||
self.db.clone(),
|
||||
finalized_state_root.into(),
|
||||
&new_finalized_state,
|
||||
) {
|
||||
error!(&self.log, "Migration error"; "error" => format!("{:?}", e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type MpscSender<E> = mpsc::Sender<(
|
||||
Hash256,
|
||||
BeaconStateHash,
|
||||
BeaconState<E>,
|
||||
Arc<HeadTracker>,
|
||||
SignedBeaconBlockHash,
|
||||
SignedBeaconBlockHash,
|
||||
Slot,
|
||||
Checkpoint,
|
||||
Checkpoint,
|
||||
)>;
|
||||
|
||||
/// Migrator that runs a background thread to migrate state from the hot to the cold database.
|
||||
@@ -243,34 +325,26 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> Migrate<E, Hot, Cold>
|
||||
Self { db, tx_thread, log }
|
||||
}
|
||||
|
||||
/// Perform the freezing operation on the database,
|
||||
fn process_finalization(
|
||||
&self,
|
||||
finalized_state_root: Hash256,
|
||||
finalized_state_root: BeaconStateHash,
|
||||
new_finalized_state: BeaconState<E>,
|
||||
max_finality_distance: u64,
|
||||
head_tracker: Arc<HeadTracker>,
|
||||
old_finalized_block_hash: SignedBeaconBlockHash,
|
||||
new_finalized_block_hash: SignedBeaconBlockHash,
|
||||
old_finalized_checkpoint: Checkpoint,
|
||||
new_finalized_checkpoint: Checkpoint,
|
||||
) {
|
||||
if !self.needs_migration(new_finalized_state.slot, max_finality_distance) {
|
||||
return;
|
||||
}
|
||||
|
||||
let (ref mut tx, ref mut thread) = *self.tx_thread.lock();
|
||||
|
||||
let new_finalized_slot = new_finalized_state.slot;
|
||||
if let Err(tx_err) = tx.send((
|
||||
finalized_state_root,
|
||||
new_finalized_state,
|
||||
head_tracker,
|
||||
old_finalized_block_hash,
|
||||
new_finalized_block_hash,
|
||||
new_finalized_slot,
|
||||
old_finalized_checkpoint,
|
||||
new_finalized_checkpoint,
|
||||
)) {
|
||||
let (new_tx, new_thread) = Self::spawn_thread(self.db.clone(), self.log.clone());
|
||||
|
||||
drop(mem::replace(tx, new_tx));
|
||||
*tx = new_tx;
|
||||
let old_thread = mem::replace(thread, new_thread);
|
||||
|
||||
// Join the old thread, which will probably have panicked, or may have
|
||||
@@ -290,53 +364,37 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> Migrate<E, Hot, Cold>
|
||||
}
|
||||
|
||||
impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Hot, Cold> {
|
||||
/// Return true if a migration needs to be performed, given a new `finalized_slot`.
|
||||
fn needs_migration(&self, finalized_slot: Slot, max_finality_distance: u64) -> bool {
|
||||
let finality_distance = finalized_slot - self.db.get_split_slot();
|
||||
finality_distance > max_finality_distance
|
||||
}
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
/// Spawn a new child thread to run the migration process.
|
||||
///
|
||||
/// Return a channel handle for sending new finalized states to the thread.
|
||||
fn spawn_thread(
|
||||
db: Arc<HotColdDB<E, Hot, Cold>>,
|
||||
log: Logger,
|
||||
) -> (
|
||||
mpsc::Sender<(
|
||||
Hash256,
|
||||
BeaconState<E>,
|
||||
Arc<HeadTracker>,
|
||||
SignedBeaconBlockHash,
|
||||
SignedBeaconBlockHash,
|
||||
Slot,
|
||||
)>,
|
||||
thread::JoinHandle<()>,
|
||||
) {
|
||||
) -> (MpscSender<E>, thread::JoinHandle<()>) {
|
||||
let (tx, rx) = mpsc::channel();
|
||||
let thread = thread::spawn(move || {
|
||||
while let Ok((
|
||||
state_root,
|
||||
state,
|
||||
head_tracker,
|
||||
old_finalized_block_hash,
|
||||
new_finalized_block_hash,
|
||||
new_finalized_slot,
|
||||
old_finalized_checkpoint,
|
||||
new_finalized_checkpoint,
|
||||
)) = rx.recv()
|
||||
{
|
||||
match Self::prune_abandoned_forks(
|
||||
db.clone(),
|
||||
head_tracker,
|
||||
old_finalized_block_hash,
|
||||
new_finalized_block_hash,
|
||||
new_finalized_slot,
|
||||
state_root,
|
||||
&state,
|
||||
old_finalized_checkpoint,
|
||||
new_finalized_checkpoint,
|
||||
&log,
|
||||
) {
|
||||
Ok(()) => {}
|
||||
Err(e) => warn!(log, "Block pruning failed: {:?}", e),
|
||||
}
|
||||
|
||||
match process_finalization(db.clone(), state_root, &state) {
|
||||
match process_finalization(db.clone(), state_root.into(), &state) {
|
||||
Ok(()) => {}
|
||||
Err(Error::HotColdDBError(HotColdDBError::FreezeSlotUnaligned(slot))) => {
|
||||
debug!(
|
||||
|
||||
Reference in New Issue
Block a user