Hierarchical state diffs in hot DB (#6750)

This PR implements https://github.com/sigp/lighthouse/pull/5978 (tree-states) but on the hot DB. It allows Lighthouse to massively reduce its disk footprint during non-finality and overall I/O in all cases.

Closes https://github.com/sigp/lighthouse/issues/6580

Conga into https://github.com/sigp/lighthouse/pull/6744

### TODOs

- [x] Fix OOM in CI https://github.com/sigp/lighthouse/pull/7176
- [x] optimise store_hot_state to avoid storing a duplicate state if the summary already exists (should be safe from races now that pruning is cleaner)
- [x] misspelled: get_ancenstor_state_root
- [x] get_ancestor_state_root should use state summaries
- [x] Prevent split from changing during ancestor calc
- [x] Use same hierarchy for hot and cold

### TODO Good optimization for future PRs

- [ ] On the migration, if the latest hot snapshot is aligned with the cold snapshot, migrate the diffs instead of the full states.
```
align slot  time
10485760    Nov-26-2024
12582912    Sep-14-2025
14680064    Jul-02-2026
```

### TODO Maybe things good to have

- [ ] Rename anchor_slot https://github.com/sigp/lighthouse/compare/tree-states-hot-rebase-oom...dapplion:lighthouse:tree-states-hot-anchor-slot-rename?expand=1
- [ ] Make anchor fields not public such that they must be mutated through a method, to prevent unwanted changes of the anchor_slot

### NOTTODO

- [ ] Use fork-choice and a new method [`descendants_of_checkpoint`](ca2388e196 (diff-046fbdb517ca16b80e4464c2c824cf001a74a0a94ac0065e635768ac391062a8)) to filter only the state summaries that descend from the finalized checkpoint
This commit is contained in:
Lion - dapplion
2025-06-19 04:43:25 +02:00
committed by GitHub
parent 6786b9d12a
commit dd98534158
33 changed files with 2695 additions and 812 deletions

View File

@@ -27,6 +27,7 @@ pub enum Error {
Compression(std::io::Error),
InvalidSszState(ssz::DecodeError),
InvalidBalancesLength,
LessThanStart(Slot, Slot),
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
@@ -67,6 +68,10 @@ impl FromStr for HierarchyConfig {
return Err("hierarchy-exponents must be in ascending order".to_string());
}
if exponents.is_empty() {
return Err("empty exponents".to_string());
}
Ok(HierarchyConfig { exponents })
}
}
@@ -478,7 +483,9 @@ impl ValidatorsDiff {
Hash256::ZERO
},
// effective_balance can increase and decrease
effective_balance: y.effective_balance - x.effective_balance,
effective_balance: y
.effective_balance
.wrapping_sub(x.effective_balance),
// slashed can only change from false into true. In an index re-use it can
// switch back to false, but in that case the pubkey will also change.
slashed: y.slashed,
@@ -642,10 +649,26 @@ impl HierarchyConfig {
Err(Error::InvalidHierarchy)
}
}
/// Returns the number of trailing zero bits of `slot`'s `u64` representation.
///
/// NOTE(review): assuming `as_u64` yields a plain `u64`, a slot of 0 gives 64 here, since
/// `u64::trailing_zeros` returns the full bit width for zero.
pub fn exponent_for_slot(slot: Slot) -> u32 {
    let raw = slot.as_u64();
    raw.trailing_zeros()
}
}
impl HierarchyModuli {
pub fn storage_strategy(&self, slot: Slot) -> Result<StorageStrategy, Error> {
/// * `slot` - Slot of the storage strategy
/// * `start_slot` - Slot before which states are not available. Initial snapshot point, which
/// may not be aligned to the hierarchy moduli values. Given an example of
/// exponents [5,13,21], to reconstruct state at slot 3,000,003: if start = 3,000,002
/// layer 2 diff will point to the start snapshot instead of the layer 1 diff at
/// 2998272.
pub fn storage_strategy(&self, slot: Slot, start_slot: Slot) -> Result<StorageStrategy, Error> {
match slot.cmp(&start_slot) {
Ordering::Less => return Err(Error::LessThanStart(slot, start_slot)),
Ordering::Equal => return Ok(StorageStrategy::Snapshot),
Ordering::Greater => {} // continue
}
// last = full snapshot interval
let last = self.moduli.last().copied().ok_or(Error::InvalidHierarchy)?;
// first = most frequent diff layer, need to replay blocks from this layer
@@ -667,14 +690,22 @@ impl HierarchyModuli {
.find_map(|(&n_big, &n_small)| {
if slot % n_small == 0 {
// Diff from the previous layer.
Some(StorageStrategy::DiffFrom(slot / n_big * n_big))
let from = slot / n_big * n_big;
// Or from start point
let from = std::cmp::max(from, start_slot);
Some(StorageStrategy::DiffFrom(from))
} else {
// Keep trying with next layer
None
}
})
// Exhausted layers, need to replay from most frequent layer
.unwrap_or(StorageStrategy::ReplayFrom(slot / first * first)))
.unwrap_or_else(|| {
let from = slot / first * first;
// Or from start point
let from = std::cmp::max(from, start_slot);
StorageStrategy::ReplayFrom(from)
}))
}
/// Return the smallest slot greater than or equal to `slot` at which a full snapshot should
@@ -703,6 +734,26 @@ impl HierarchyModuli {
|second_layer_moduli| Ok(slot % *second_layer_moduli == 0),
)
}
/// For every layer modulus `n`, computes `slot / n * n` (the closest multiple of `n` at or below
/// `slot`), raised to `start_slot` whenever that multiple falls before it.
///
/// Consecutive equal points are emitted only once. Such repeats arise from the capping at
/// `start_slot` (several layers may all collapse to `start_slot`) and from shared multiples (a
/// slot that is a multiple of 2**n is also a multiple of 2**m for all m < n).
pub fn closest_layer_points(&self, slot: Slot, start_slot: Slot) -> Vec<Slot> {
    let mut points: Vec<Slot> = Vec::new();
    for &modulus in self.moduli.iter() {
        let aligned = slot / modulus * modulus;
        // Never point before the start slot.
        let capped = std::cmp::max(aligned, start_slot);
        // Skip a point equal to the one just pushed (adjacent duplicate).
        if points.last() != Some(&capped) {
            points.push(capped);
        }
    }
    points
}
}
impl StorageStrategy {
@@ -732,6 +783,27 @@ impl StorageStrategy {
}
.map(Slot::from)
}
/// Returns the slot carried by this strategy: the `from` slot of `ReplayFrom` or `DiffFrom`.
/// A `Snapshot` carries no slot, so it yields `None`.
pub fn diff_base_slot(&self) -> Option<Slot> {
    match self {
        Self::ReplayFrom(base) | Self::DiffFrom(base) => Some(*base),
        Self::Snapshot => None,
    }
}
/// Returns `true` only for the `ReplayFrom` variant.
pub fn is_replay_from(&self) -> bool {
    match self {
        Self::ReplayFrom(_) => true,
        Self::DiffFrom(_) | Self::Snapshot => false,
    }
}
/// Returns `true` only for the `DiffFrom` variant.
pub fn is_diff_from(&self) -> bool {
    match self {
        Self::DiffFrom(_) => true,
        Self::ReplayFrom(_) | Self::Snapshot => false,
    }
}
/// Returns `true` only for the `Snapshot` variant.
pub fn is_snapshot(&self) -> bool {
    match self {
        Self::Snapshot => true,
        Self::ReplayFrom(_) | Self::DiffFrom(_) => false,
    }
}
}
#[cfg(test)]
@@ -743,34 +815,37 @@ mod tests {
fn default_storage_strategy() {
let config = HierarchyConfig::default();
config.validate().unwrap();
let sslot = Slot::new(0);
let moduli = config.to_moduli().unwrap();
// Full snapshots at multiples of 2^21.
let snapshot_freq = Slot::new(1 << 21);
assert_eq!(
moduli.storage_strategy(Slot::new(0)).unwrap(),
moduli.storage_strategy(Slot::new(0), sslot).unwrap(),
StorageStrategy::Snapshot
);
assert_eq!(
moduli.storage_strategy(snapshot_freq).unwrap(),
moduli.storage_strategy(snapshot_freq, sslot).unwrap(),
StorageStrategy::Snapshot
);
assert_eq!(
moduli.storage_strategy(snapshot_freq * 3).unwrap(),
moduli.storage_strategy(snapshot_freq * 3, sslot).unwrap(),
StorageStrategy::Snapshot
);
// Diffs should be from the previous layer (the snapshot in this case), and not the previous diff in the same layer.
let first_layer = Slot::new(1 << 18);
assert_eq!(
moduli.storage_strategy(first_layer * 2).unwrap(),
moduli.storage_strategy(first_layer * 2, sslot).unwrap(),
StorageStrategy::DiffFrom(Slot::new(0))
);
let replay_strategy_slot = first_layer + 1;
assert_eq!(
moduli.storage_strategy(replay_strategy_slot).unwrap(),
moduli
.storage_strategy(replay_strategy_slot, sslot)
.unwrap(),
StorageStrategy::ReplayFrom(first_layer)
);
}
@@ -940,4 +1015,93 @@ mod tests {
]
);
}
// Checks that the diffs and snapshots required to store split states stay available in the hot
// DB while the split slot advances, provided the invariant holds for the initial configuration.
//
// * `start_slot` - slot of the single initial snapshot.
// * `epoch_jump` - multiplier applied to 32 slots to get the finalization step per iteration.
fn test_slots_retained_invariant(hierarchy: HierarchyModuli, start_slot: u64, epoch_jump: u64) {
    let start_slot = Slot::new(start_slot);

    // The only thing stored at the beginning is one snapshot at `start_slot`, which is the
    // layout produced by checkpoint sync (or the V24 migration).
    let mut finalized_slot = start_slot;
    let mut retained_slots = vec![finalized_slot];

    // Keep going until two snapshots into the future have been reached.
    let first_snapshot = hierarchy.next_snapshot_slot(start_slot).unwrap();
    let stop_at = hierarchy.next_snapshot_slot(first_snapshot + 1).unwrap();

    while finalized_slot <= stop_at {
        // Advance several epochs at once: inter-epoch states are not interesting and iterating
        // over every one of them would take too long.
        let next_finalized = finalized_slot + 32 * epoch_jump;
        let next_retained = hierarchy.closest_layer_points(next_finalized, start_slot);

        for slot in &next_retained {
            // Each newly required slot must either have been retained before the previous
            // finalized slot, OR be at/after it (i.e. written to the hot DB by regular state
            // storage).
            assert!(retained_slots.contains(slot) || *slot >= finalized_slot);
        }

        retained_slots = next_retained;
        finalized_slot = next_finalized;
    }
}
#[test]
fn slots_retained_invariant() {
    // Builders for the two hierarchies exercised below.
    let default_moduli = || HierarchyConfig::default().to_moduli().unwrap();
    // Very dense hierarchy config.
    let dense_moduli = || {
        HierarchyConfig::from_str("5,7")
            .unwrap()
            .to_moduli()
            .unwrap()
    };
    // Start slot lying between the 2^13 and 2^16 layers of the default hierarchy.
    let unaligned_start = 2 * (1 << 14) - 5 * 32;

    // Each case is (hierarchy, start_slot, epoch_jump).
    let cases = [
        // Default hierarchy, one epoch finalizing at a time.
        (default_moduli(), unaligned_start, 1),
        // Default hierarchy with 8 epochs finalizing at a time (should not make any
        // difference).
        (default_moduli(), unaligned_start, 8),
        // Dense hierarchy, one epoch finalizing at a time.
        (dense_moduli(), 32, 1),
        // Dense hierarchy that skips a whole snapshot on its first finalization.
        (dense_moduli(), 32, 1 << 7),
    ];

    for (hierarchy, start_slot, epoch_jump) in cases {
        test_slots_retained_invariant(hierarchy, start_slot, epoch_jump);
    }
}
#[test]
fn closest_layer_points_unique() {
    let hierarchy = HierarchyConfig::default().to_moduli().unwrap();
    let start_slot = Slot::new(0);
    // Cover every slot before the snapshot slot returned for slot 1.
    let end_slot = hierarchy.next_snapshot_slot(Slot::new(1)).unwrap();

    for raw_slot in 0..end_slot.as_u64() {
        let closest_layer_points =
            hierarchy.closest_layer_points(Slot::new(raw_slot), start_slot);
        // Each point must be strictly greater than the next one, which also rules out duplicates.
        assert!(closest_layer_points.is_sorted_by(|a, b| a > b));
    }
}
}