Restore crash safety for database pruning (#4975)

* Add some DB sanity checks
* Restore crash safety for database pruning
2026-04-19 13:58:28 +00:00 · 2023-12-04 17:15:25 +11:00
parent 66d30bc0bc
commit cefe9fdf70
4 changed files with 59 additions and 75 deletions
--- a/beacon_node/beacon_chain/src/builder.rs
+++ b/beacon_node/beacon_chain/src/builder.rs
@@ -34,7 +34,7 @@ use std::time::Duration;
 use store::{Error as StoreError, HotColdDB, ItemStore, KeyValueStoreOp};
 use task_executor::{ShutdownReason, TaskExecutor};
 use types::{
-    BeaconBlock, BeaconState, ChainSpec, Checkpoint, Epoch, EthSpec, Graffiti, Hash256, Signature,
+    BeaconBlock, BeaconState, ChainSpec, Epoch, EthSpec, Graffiti, Hash256, Signature,
    SignedBeaconBlock, Slot,
 };

@@ -559,16 +559,6 @@ where
                .map_err(|e| format!("Failed to initialize blob info: {:?}", e))?,
        );

-        // Store pruning checkpoint to prevent attempting to prune before the anchor state.
-        self.pending_io_batch.push(
-            store
-                .pruning_checkpoint_store_op(Checkpoint {
-                    root: weak_subj_block_root,
-                    epoch: weak_subj_state.slot().epoch(TEthSpec::slots_per_epoch()),
-                })
-                .map_err(|e| format!("{:?}", e))?,
-        );
-
        let snapshot = BeaconSnapshot {
            beacon_block_root: weak_subj_block_root,
            beacon_block: Arc::new(weak_subj_block),
--- a/beacon_node/beacon_chain/src/migrate.rs
+++ b/beacon_node/beacon_chain/src/migrate.rs
@@ -512,13 +512,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
        genesis_block_root: Hash256,
        log: &Logger,
    ) -> Result<PruningOutcome, BeaconChainError> {
-        let old_finalized_checkpoint =
-            store
-                .load_pruning_checkpoint()?
-                .unwrap_or_else(|| Checkpoint {
-                    epoch: Epoch::new(0),
-                    root: Hash256::zero(),
-                });
+        let old_finalized_checkpoint = store.get_pruning_checkpoint();

        let old_finalized_slot = old_finalized_checkpoint
            .epoch
@@ -572,6 +566,21 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
            })
            .collect::<Result<_, _>>()?;

+        // Quick sanity check. If the canonical block & state roots are incorrect then we could
+        // incorrectly delete canonical states, which would corrupt the database.
+        let expected_canonical_block_roots = new_finalized_slot
+            .saturating_sub(old_finalized_slot)
+            .as_usize()
+            .saturating_add(1);
+        if newly_finalized_chain.len() != expected_canonical_block_roots {
+            return Err(BeaconChainError::DBInconsistent(format!(
+                "canonical chain iterator is corrupt; \
+                 expected {} but got {} block roots",
+                expected_canonical_block_roots,
+                newly_finalized_chain.len()
+            )));
+        }
+
        // We don't know which blocks are shared among abandoned chains, so we buffer and delete
        // everything in one fell swoop.
        let mut abandoned_blocks: HashSet<SignedBeaconBlockHash> = HashSet::new();
@@ -735,11 +744,6 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
            persisted_head.as_kv_store_op(BEACON_CHAIN_DB_KEY)?,
        ));

-        // Persist the new finalized checkpoint as the pruning checkpoint.
-        batch.push(StoreOp::KeyValueOp(
-            store.pruning_checkpoint_store_op(new_finalized_checkpoint)?,
-        ));
-
        store.do_atomically_with_block_and_blobs_cache(batch)?;
        debug!(
            log,
@@ -753,19 +757,26 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
            let (state_root, summary) = res?;

            if summary.slot <= new_finalized_slot {
-                // If state root doesn't match state root from canonical chain, or this slot
-                // is not part of the recently finalized chain, then delete.
+                // If state root doesn't match state root from canonical chain, then delete.
+                // We may also find older states here that should have been deleted by `migrate_db`
+                // but weren't because its writes are not guaranteed to be atomic.
                if newly_finalized_chain
                    .get(&summary.slot)
                    .map_or(true, |(_, canonical_state_root)| {
                        state_root != Hash256::from(*canonical_state_root)
                    })
                {
+                    let reason = if summary.slot < old_finalized_slot {
+                        "old dangling state"
+                    } else {
+                        "non-canonical"
+                    };
                    debug!(
                        log,
                        "Deleting state";
                        "state_root" => ?state_root,
                        "slot" => summary.slot,
+                        "reason" => reason,
                    );
                    state_delete_batch.push(StoreOp::DeleteState(state_root, Some(summary.slot)));
                }