Hierarchical state diffs in hot DB (#6750)

This PR implements https://github.com/sigp/lighthouse/pull/5978 (tree-states) but on the hot DB. It allows Lighthouse to massively reduce its disk footprint during non-finality, and to reduce overall I/O in all cases.

Closes https://github.com/sigp/lighthouse/issues/6580

Conga into https://github.com/sigp/lighthouse/pull/6744

### TODOs

- [x] Fix OOM in CI https://github.com/sigp/lighthouse/pull/7176
- [x] optimise store_hot_state to avoid storing a duplicate state if the summary already exists (should be safe from races now that pruning is cleaner)
- [x] misspelled: get_ancenstor_state_root
- [x] get_ancestor_state_root should use state summaries
- [x] Prevent split from changing during ancestor calc
- [x] Use same hierarchy for hot and cold

### TODO Good optimization for future PRs

- [ ] On the migration, if the latest hot snapshot is aligned with the cold snapshot, migrate the diffs instead of the full states.
```
align slot  time
10485760    Nov-26-2024
12582912    Sep-14-2025
14680064    Jul-02-2026
```

### TODO Nice-to-have improvements

- [ ] Rename anchor_slot https://github.com/sigp/lighthouse/compare/tree-states-hot-rebase-oom...dapplion:lighthouse:tree-states-hot-anchor-slot-rename?expand=1
- [ ] Make anchor fields non-public so that they must be mutated through a method, to prevent unwanted changes to the anchor_slot

### NOTTODO

- [ ] Use fork-choice and a new method [`descendants_of_checkpoint`](ca2388e196 (diff-046fbdb517ca16b80e4464c2c824cf001a74a0a94ac0065e635768ac391062a8)) to filter only the state summaries that descend from the finalized checkpoint
This commit is contained in:
Lion - dapplion
2025-06-19 04:43:25 +02:00
committed by GitHub
parent 6786b9d12a
commit dd98534158
33 changed files with 2695 additions and 812 deletions

View File

@@ -44,8 +44,8 @@ use store::{Error as StoreError, HotColdDB, ItemStore, KeyValueStoreOp};
use task_executor::{ShutdownReason, TaskExecutor};
use tracing::{debug, error, info};
use types::{
BeaconBlock, BeaconState, BlobSidecarList, ChainSpec, Checkpoint, DataColumnSidecarList, Epoch,
EthSpec, FixedBytesExtended, Hash256, Signature, SignedBeaconBlock, Slot,
BeaconBlock, BeaconState, BlobSidecarList, ChainSpec, DataColumnSidecarList, Epoch, EthSpec,
FixedBytesExtended, Hash256, Signature, SignedBeaconBlock, Slot,
};
/// An empty struct used to "witness" all the `BeaconChainTypes` traits. It has no user-facing
@@ -382,21 +382,29 @@ where
}
/// Starts a new chain from a genesis state.
pub fn genesis_state(mut self, beacon_state: BeaconState<E>) -> Result<Self, String> {
pub fn genesis_state(mut self, mut beacon_state: BeaconState<E>) -> Result<Self, String> {
let store = self.store.clone().ok_or("genesis_state requires a store")?;
// Initialize anchor info before attempting to write the genesis state.
// Since v4.4.0 we will set the anchor with a dummy state upper limit in order to prevent
// historic states from being retained (unless `--reconstruct-historic-states` is set).
let retain_historic_states = self.chain_config.reconstruct_historic_states;
let genesis_beacon_block = genesis_block(&mut beacon_state, &self.spec)?;
self.pending_io_batch.push(
store
.init_anchor_info(
genesis_beacon_block.parent_root(),
genesis_beacon_block.slot(),
Slot::new(0),
retain_historic_states,
)
.map_err(|e| format!("Failed to initialize genesis anchor: {:?}", e))?,
);
let (genesis, updated_builder) = self.set_genesis_state(beacon_state)?;
self = updated_builder;
// Stage the database's metadata fields for atomic storage when `build` is called.
// Since v4.4.0 we will set the anchor with a dummy state upper limit in order to prevent
// historic states from being retained (unless `--reconstruct-historic-states` is set).
let retain_historic_states = self.chain_config.reconstruct_historic_states;
self.pending_io_batch.push(
store
.init_anchor_info(genesis.beacon_block.message(), retain_historic_states)
.map_err(|e| format!("Failed to initialize genesis anchor: {:?}", e))?,
);
self.pending_io_batch.push(
store
.init_blob_info(genesis.beacon_block.slot())
@@ -521,6 +529,13 @@ where
}
}
debug!(
slot = %weak_subj_slot,
state_root = ?weak_subj_state_root,
block_root = ?weak_subj_block_root,
"Storing split from weak subjectivity state"
);
// Set the store's split point *before* storing genesis so that genesis is stored
// immediately in the freezer DB.
store.set_split(weak_subj_slot, weak_subj_state_root, weak_subj_block_root);
@@ -541,6 +556,26 @@ where
.cold_db
.do_atomically(block_root_batch)
.map_err(|e| format!("Error writing frozen block roots: {e:?}"))?;
debug!(
from = %weak_subj_block.slot(),
to_excl = %weak_subj_state.slot(),
block_root = ?weak_subj_block_root,
"Stored frozen block roots at skipped slots"
);
// Write the anchor to memory before calling `put_state` otherwise hot hdiff can't store
// states that do not align with the `start_slot` grid.
let retain_historic_states = self.chain_config.reconstruct_historic_states;
self.pending_io_batch.push(
store
.init_anchor_info(
weak_subj_block.parent_root(),
weak_subj_block.slot(),
weak_subj_slot,
retain_historic_states,
)
.map_err(|e| format!("Failed to initialize anchor info: {:?}", e))?,
);
// Write the state, block and blobs non-atomically, it doesn't matter if they're forgotten
// about on a crash restart.
@@ -551,6 +586,8 @@ where
weak_subj_state.clone(),
)
.map_err(|e| format!("Failed to set checkpoint state as finalized state: {:?}", e))?;
// Note: post hot hdiff must update the anchor info before attempting to put_state otherwise
// the write will fail if the weak_subj_slot is not aligned with the snapshot moduli.
store
.put_state(&weak_subj_state_root, &weak_subj_state)
.map_err(|e| format!("Failed to store weak subjectivity state: {e:?}"))?;
@@ -580,13 +617,7 @@ where
// Stage the database's metadata fields for atomic storage when `build` is called.
// This prevents the database from restarting in an inconsistent state if the anchor
// info or split point is written before the `PersistedBeaconChain`.
let retain_historic_states = self.chain_config.reconstruct_historic_states;
self.pending_io_batch.push(store.store_split_in_batch());
self.pending_io_batch.push(
store
.init_anchor_info(weak_subj_block.message(), retain_historic_states)
.map_err(|e| format!("Failed to initialize anchor info: {:?}", e))?,
);
self.pending_io_batch.push(
store
.init_blob_info(weak_subj_block.slot())
@@ -598,13 +629,6 @@ where
.map_err(|e| format!("Failed to initialize data column info: {:?}", e))?,
);
// Store pruning checkpoint to prevent attempting to prune before the anchor state.
self.pending_io_batch
.push(store.pruning_checkpoint_store_op(Checkpoint {
root: weak_subj_block_root,
epoch: weak_subj_state.slot().epoch(E::slots_per_epoch()),
}));
let snapshot = BeaconSnapshot {
beacon_block_root: weak_subj_block_root,
beacon_block: Arc::new(weak_subj_block),