Make DAG construction more permissive (#7460)

Workaround/fix for:

- https://github.com/sigp/lighthouse/issues/7323


  - Remove the `StateSummariesNotContiguousError`. This allows us to continue with DAG construction and pruning, even in the case where the DAG is disjointed. We will treat any disjoint summaries as roots of their own tree, and prune them (as they are not descended from finalized). This should be safe, as canonical summaries should not be disjoint (if they are, then the DB is already corrupt).
This commit is contained in:
Michael Sproul
2025-05-15 12:15:35 +10:00
committed by GitHub
parent 851ee2bced
commit c2c7fb87a8
4 changed files with 31 additions and 37 deletions

View File

@@ -572,7 +572,8 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
if state_summaries_dag_roots.len() > 1 {
warn!(
state_summaries_dag_roots = ?state_summaries_dag_roots,
"Prune state summaries dag found more than one root"
error = "summaries dag found more than one root",
"Notify the devs your hot DB has some inconsistency. Pruning will fix it but devs want to know about it",
);
}

View File

@@ -41,7 +41,7 @@ pub fn upgrade_to_v22<T: BeaconChainTypes>(
db: Arc<HotColdDB<T::EthSpec, T::HotStore, T::ColdStore>>,
genesis_state_root: Option<Hash256>,
) -> Result<(), Error> {
info!("Upgrading from v21 to v22");
info!("Upgrading DB schema from v21 to v22");
let old_anchor = db.get_anchor_info();

View File

@@ -8,6 +8,7 @@ use ssz::{Decode, Encode};
use ssz_derive::{Decode, Encode};
use std::sync::Arc;
use store::{DBColumn, Error, HotColdDB, KeyValueStore, KeyValueStoreOp, StoreItem};
use tracing::{debug, info};
use types::{Hash256, Slot};
/// Dummy value to use for the canonical head block root, see below.
@@ -16,6 +17,8 @@ pub const DUMMY_CANONICAL_HEAD_BLOCK_ROOT: Hash256 = Hash256::repeat_byte(0xff);
pub fn upgrade_to_v23<T: BeaconChainTypes>(
db: Arc<HotColdDB<T::EthSpec, T::HotStore, T::ColdStore>>,
) -> Result<Vec<KeyValueStoreOp>, Error> {
info!("Upgrading DB schema from v22 to v23");
// 1) Set the head-tracker to empty
let Some(persisted_beacon_chain_v22) =
db.get_item::<PersistedBeaconChainV22>(&BEACON_CHAIN_DB_KEY)?
@@ -37,10 +40,24 @@ pub fn upgrade_to_v23<T: BeaconChainTypes>(
.hot_db
.iter_column_keys::<Hash256>(DBColumn::BeaconStateTemporary)
{
let state_root = state_root_result?;
debug!(
?state_root,
"Deleting temporary state flag on v23 schema migration"
);
ops.push(KeyValueStoreOp::DeleteKey(
DBColumn::BeaconStateTemporary,
state_root_result?.as_slice().to_vec(),
state_root.as_slice().to_vec(),
));
// Here we SHOULD delete the items for key `state_root` in columns `BeaconState` and
// `BeaconStateSummary`. However, in the event we have dangling temporary states at the time
// of the migration, the first pruning routine will prune them. They will be a tree branch /
// root not part of the finalized tree and trigger a warning log once.
//
// We believe there may be race conditions concerning temporary flags where a necessary
// canonical state is marked as temporary. In current stable, a restart with that DB will
// corrupt the DB. In the unlikely case this happens we choose to leave the states and
// allow pruning to clean them.
}
Ok(ops)

View File

@@ -43,13 +43,6 @@ pub enum Error {
state_root: Hash256,
latest_block_root: Hash256,
},
StateSummariesNotContiguous {
state_root: Hash256,
state_slot: Slot,
latest_block_root: Hash256,
parent_block_root: Box<Hash256>,
parent_block_latest_state_summary: Box<Option<(Slot, Hash256)>>,
},
MissingChildStateRoot(Hash256),
RequestedSlotAboveSummary {
starting_state_root: Hash256,
@@ -163,34 +156,17 @@ impl StateSummariesDAG {
**state_root
} else {
// Common case: not a skipped slot.
//
// If we can't find a state summary for the parent block and previous slot,
// then there is some amount of disjointedness in the DAG. We set the parent
// state root to 0x0 in this case, and will prune any dangling states.
let parent_block_root = summary.block_parent_root;
if let Some(parent_block_summaries) =
state_summaries_by_block_root.get(&parent_block_root)
{
*parent_block_summaries
.get(&previous_slot)
// Should never error: summaries are contiguous, so if there's an
// entry it must contain at least one summary at the previous slot.
.ok_or(Error::StateSummariesNotContiguous {
state_root: *state_root,
state_slot: summary.slot,
latest_block_root: summary.latest_block_root,
parent_block_root: parent_block_root.into(),
parent_block_latest_state_summary: parent_block_summaries
.iter()
.max_by(|a, b| a.0.cmp(b.0))
.map(|(slot, (state_root, _))| (*slot, **state_root))
.into(),
})?
.0
} else {
// We don't know of any summary with this parent block root. We'll
// consider this summary to be a root of `state_summaries_v22`
// collection and mark it as zero.
// The test store_tests::finalizes_non_epoch_start_slot manages to send two
// disjoint trees on its second migration.
Hash256::ZERO
}
state_summaries_by_block_root
.get(&parent_block_root)
.and_then(|parent_block_summaries| {
parent_block_summaries.get(&previous_slot)
})
.map_or(Hash256::ZERO, |(parent_state_root, _)| **parent_state_root)
}
};