Restore crash safety for database pruning (#4975)

* Add some DB sanity checks

* Restore crash safety for database pruning
This commit is contained in:
Michael Sproul
2023-12-04 17:15:25 +11:00
committed by GitHub
parent 66d30bc0bc
commit cefe9fdf70
4 changed files with 59 additions and 75 deletions

View File

@@ -34,7 +34,7 @@ use std::time::Duration;
use store::{Error as StoreError, HotColdDB, ItemStore, KeyValueStoreOp};
use task_executor::{ShutdownReason, TaskExecutor};
use types::{
- BeaconBlock, BeaconState, ChainSpec, Checkpoint, Epoch, EthSpec, Graffiti, Hash256, Signature,
+ BeaconBlock, BeaconState, ChainSpec, Epoch, EthSpec, Graffiti, Hash256, Signature,
SignedBeaconBlock, Slot,
};
@@ -559,16 +559,6 @@ where
.map_err(|e| format!("Failed to initialize blob info: {:?}", e))?,
);
- // Store pruning checkpoint to prevent attempting to prune before the anchor state.
- self.pending_io_batch.push(
- store
- .pruning_checkpoint_store_op(Checkpoint {
- root: weak_subj_block_root,
- epoch: weak_subj_state.slot().epoch(TEthSpec::slots_per_epoch()),
- })
- .map_err(|e| format!("{:?}", e))?,
- );
let snapshot = BeaconSnapshot {
beacon_block_root: weak_subj_block_root,
beacon_block: Arc::new(weak_subj_block),

View File

@@ -512,13 +512,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
genesis_block_root: Hash256,
log: &Logger,
) -> Result<PruningOutcome, BeaconChainError> {
- let old_finalized_checkpoint =
- store
- .load_pruning_checkpoint()?
- .unwrap_or_else(|| Checkpoint {
- epoch: Epoch::new(0),
- root: Hash256::zero(),
- });
+ let old_finalized_checkpoint = store.get_pruning_checkpoint();
let old_finalized_slot = old_finalized_checkpoint
.epoch
@@ -572,6 +566,21 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
})
.collect::<Result<_, _>>()?;
+ // Quick sanity check. If the canonical block & state roots are incorrect then we could
+ // incorrectly delete canonical states, which would corrupt the database.
+ let expected_canonical_block_roots = new_finalized_slot
+ .saturating_sub(old_finalized_slot)
+ .as_usize()
+ .saturating_add(1);
+ if newly_finalized_chain.len() != expected_canonical_block_roots {
+ return Err(BeaconChainError::DBInconsistent(format!(
+ "canonical chain iterator is corrupt; \
+ expected {} but got {} block roots",
+ expected_canonical_block_roots,
+ newly_finalized_chain.len()
+ )));
+ }
// We don't know which blocks are shared among abandoned chains, so we buffer and delete
// everything in one fell swoop.
let mut abandoned_blocks: HashSet<SignedBeaconBlockHash> = HashSet::new();
@@ -735,11 +744,6 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
persisted_head.as_kv_store_op(BEACON_CHAIN_DB_KEY)?,
));
- // Persist the new finalized checkpoint as the pruning checkpoint.
- batch.push(StoreOp::KeyValueOp(
- store.pruning_checkpoint_store_op(new_finalized_checkpoint)?,
- ));
store.do_atomically_with_block_and_blobs_cache(batch)?;
debug!(
log,
@@ -753,19 +757,26 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
let (state_root, summary) = res?;
if summary.slot <= new_finalized_slot {
- // If state root doesn't match state root from canonical chain, or this slot
- // is not part of the recently finalized chain, then delete.
+ // If state root doesn't match state root from canonical chain, then delete.
+ // We may also find older states here that should have been deleted by `migrate_db`
+ // but weren't due to wonky I/O atomicity.
if newly_finalized_chain
.get(&summary.slot)
.map_or(true, |(_, canonical_state_root)| {
state_root != Hash256::from(*canonical_state_root)
})
{
+ let reason = if summary.slot < old_finalized_slot {
+ "old dangling state"
+ } else {
+ "non-canonical"
+ };
debug!(
log,
"Deleting state";
"state_root" => ?state_root,
"slot" => summary.slot,
+ "reason" => reason,
);
state_delete_batch.push(StoreOp::DeleteState(state_root, Some(summary.slot)));
}