Fix data race in load_hot_state

This was found in the wild on a node that was reconstructing states and had a ~350 epoch gap between finalization and the split slot. When the finalization migration ran it deleted some of the hot state summaries that were still being iterated by `load_hot_state`, causing state storage to fail and the database to become corrupt (due to the atomicity bug from `unstable`). This commit additonally adds states created from diffs to the cache. This should help avoid iterating back so far but is strictly a performance improvement and *not* a correctness fix.
2026-05-09 11:25:55 +00:00 · 2023-01-31 12:49:29 +11:00
parent dbb0cf7099
commit 2f6ffff8d6
1 changed files with 17 additions and 1 deletions
--- a/beacon_node/store/src/hot_cold_store.rs
+++ b/beacon_node/store/src/hot_cold_store.rs
@@ -1015,6 +1015,11 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>
            return Ok(None);
        }

+        // Take a read lock on the split point while we load data from prior states. We need
+        // to prevent the finalization migration from deleting the state summaries and state diffs
+        // that we are iterating back through.
+        let split_read_lock = self.split.read_recursive();
+
        // Backtrack until we reach a state that is in the cache, or in the worst case
        // the finalized state (this should only be reachable on first start-up).
        let state_summary_iter = HotStateRootIter::new(self, target_slot, *state_root);
@@ -1052,7 +1057,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>

            // If the prior state is the split state and it isn't cached then load it in
            // entirety from disk. This should only happen on first start up.
-            if prior_state_root == self.get_split_info().state_root {
+            if prior_state_root == split_read_lock.state_root {
                debug!(
                    self.log,
                    "Using split state as base state for replay";
@@ -1075,6 +1080,9 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>

        let (_, mut state) = base_state.ok_or(Error::NoBaseStateFound(*state_root))?;

+        // Finished reading information about prior states, allow the split point to update.
+        drop(split_read_lock);
+
        // Construct a mutable iterator for the state roots, which will be iterated through
        // consecutive calls to `replay_blocks`.
        let mut state_roots_iter = state_roots.into_iter();
@@ -1182,6 +1190,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>

            // Apply state diff. Block replay should have ensured that the diff is now applicable.
            if let Some((summary, to_root, diff)) = next_state_diff {
+                let block_root = summary.latest_block_root;
                debug!(
                    self.log,
                    "Applying state diff";
@@ -1189,6 +1198,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>
                    "from_slot" => summary.diff_base_slot,
                    "to_root" => ?to_root,
                    "to_slot" => summary.slot,
+                    "block_root" => ?block_root,
                );
                debug_assert_eq!(summary.diff_base_slot, state.slot());

@@ -1196,6 +1206,12 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>

                state.update_tree_hash_cache()?;
                state.build_all_caches(&self.spec)?;
+
+                // Add state to the cache, it is by definition an epoch boundary state and likely
+                // to be useful.
+                self.state_cache
+                    .lock()
+                    .put_state(to_root, block_root, &state)?;
            }
        }