From 918121e3134134e87dae9cc32b029ccf635d846b Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Tue, 12 Aug 2025 12:19:24 +1000 Subject: [PATCH] Fix bugs in rebasing of states prior to finalization (#7849) Attempt to fix this error reported by `beaconcha.in` on their Hoodi archive nodes: > {"code":500,"message":"UNHANDLED_ERROR: DBError(CacheBuildError(BeaconState(MilhouseError(OutOfBoundsIterFrom { index: 1199549, len: 1060000 }))))","stacktraces":[]} There are only a handful of places where we call `iter_from`. This one is safe by construction (the check immediately prior ensures `self.pubkeys.len()` is not out of bounds): https://github.com/sigp/lighthouse/blob/cfb1f7331064b758c6786e4e1dc15507af5ff5d1/beacon_node/beacon_chain/src/validator_pubkey_cache.rs#L84-L90 This one should also be safe, and the indexes used here would not be as large as the ones in the reported error: https://github.com/sigp/lighthouse/blob/cfb1f7331064b758c6786e4e1dc15507af5ff5d1/consensus/state_processing/src/per_epoch_processing/single_pass.rs#L365-L368 Which leaves one remaining usage which must be the culprit: https://github.com/sigp/lighthouse/blob/cfb1f7331064b758c6786e4e1dc15507af5ff5d1/consensus/types/src/beacon_state.rs#L2109-L2113 This indexing relies on the invariant that `self.pubkey_cache().len() <= self.validators.len()`. We mostly maintain that invariant, except for in `rebase_caches_on` (fixed in this PR). The other bug, is that we were calling `rebase_on_finalized` for all "hot" states, which post-v7.1.0 includes states prior to the split which are required by the hdiff grid. This is how we end up calling something like `genesis_state.rebase_on(&split_state)`, which then corrupts the pubkey cache of the genesis state using the newer pubkey cache from the split state. --- beacon_node/store/src/state_cache.rs | 7 ++++++- consensus/types/src/beacon_state.rs | 10 ++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/beacon_node/store/src/state_cache.rs b/beacon_node/store/src/state_cache.rs index 30d0e2b2d5..352760e808 100644 --- a/beacon_node/store/src/state_cache.rs +++ b/beacon_node/store/src/state_cache.rs @@ -186,8 +186,13 @@ impl StateCache { state: &mut BeaconState, spec: &ChainSpec, ) -> Result<(), Error> { + // Do not attempt to rebase states prior to the finalized state. This method might be called + // with states on the hdiff grid prior to finalization, as part of the reconstruction of + // some later unfinalized state. if let Some(finalized_state) = &self.finalized_state { - state.rebase_on(&finalized_state.state, spec)?; + if state.slot() >= finalized_state.state.slot() { + state.rebase_on(&finalized_state.state, spec)?; + } } Ok(()) } diff --git a/consensus/types/src/beacon_state.rs b/consensus/types/src/beacon_state.rs index c7dfabbfee..8bfbce87d3 100644 --- a/consensus/types/src/beacon_state.rs +++ b/consensus/types/src/beacon_state.rs @@ -2535,11 +2535,17 @@ impl BeaconState { pub fn rebase_caches_on(&mut self, base: &Self, spec: &ChainSpec) -> Result<(), Error> { // Use pubkey cache from `base` if it contains superior information (likely if our cache is - // uninitialized). + // uninitialized). Be careful not to use a cache which has *more* validators than expected, + // as other code expects `self.pubkey_cache().len() <= self.validators.len()`. let num_validators = self.validators().len(); let pubkey_cache = self.pubkey_cache_mut(); let base_pubkey_cache = base.pubkey_cache(); - if pubkey_cache.len() < base_pubkey_cache.len() && pubkey_cache.len() < num_validators { + + let current_cache_is_incomplete = pubkey_cache.len() < num_validators; + let base_cache_is_compatible = base_pubkey_cache.len() <= num_validators; + let base_cache_is_superior = base_pubkey_cache.len() > pubkey_cache.len(); + + if current_cache_is_incomplete && base_cache_is_compatible && base_cache_is_superior { *pubkey_cache = base_pubkey_cache.clone(); }