In-memory tree states (#5533)

* Consensus changes

* EF tests

* lcli

* common and watch

* account manager

* cargo

* fork choice

* promise cache

* beacon chain

* interop genesis

* http api

* lighthouse

* op pool

* beacon chain misc

* parallel state cache

* store

* fix issues in store

* IT COMPILES

* Remove some unnecessary module qualification

* Revert Arced pubkey optimization (#5536)

* Merge remote-tracking branch 'origin/unstable' into tree-states-memory

* Fix caching, rebasing and some tests

* Remove unused deps

* Merge remote-tracking branch 'origin/unstable' into tree-states-memory

* Small cleanups

* Revert shuffling cache/promise cache changes

* Fix state advance bugs

* Fix shuffling tests

* Remove some resolved FIXMEs

* Remove StateProcessingStrategy

* Optimise withdrawals calculation

* Don't reorg if state cache is missed

* Remove inconsistent state func

* Fix beta compiler

* Rebase early, rebase often

* Fix state caching behaviour

* Update to milhouse release

* Fix on-disk consensus context format

* Merge remote-tracking branch 'origin/unstable' into tree-states-memory

* Squashed commit of the following:

commit 3a16649023
Author: Michael Sproul <michael@sigmaprime.io>
Date:   Thu Apr 18 14:26:09 2024 +1000

    Fix on-disk consensus context format

* Keep indexed attestations, thanks Sean

* Merge branch 'on-disk-consensus-context' into tree-states-memory

* Merge branch 'unstable' into tree-states-memory

* Address half of Sean's review

* More simplifications from Sean's review

* Cache state after get_advanced_hot_state
This commit is contained in:
Michael Sproul
2024-04-24 11:22:36 +10:00
committed by GitHub
parent 4cad1fcbbe
commit 61962898e2
108 changed files with 2038 additions and 2762 deletions

View File

@@ -58,7 +58,6 @@ use crate::persisted_beacon_chain::{PersistedBeaconChain, DUMMY_CANONICAL_HEAD_B
use crate::persisted_fork_choice::PersistedForkChoice;
use crate::pre_finalization_cache::PreFinalizationBlockCache;
use crate::shuffling_cache::{BlockShufflingIds, ShufflingCache};
use crate::snapshot_cache::{BlockProductionPreState, SnapshotCache};
use crate::sync_committee_verification::{
Error as SyncCommitteeError, VerifiedSyncCommitteeMessage, VerifiedSyncContribution,
};
@@ -103,8 +102,7 @@ use state_processing::{
},
per_slot_processing,
state_advance::{complete_state_advance, partial_state_advance},
BlockSignatureStrategy, ConsensusContext, SigVerifiedOp, StateProcessingStrategy,
VerifyBlockRoot, VerifyOperation,
BlockSignatureStrategy, ConsensusContext, SigVerifiedOp, VerifyBlockRoot, VerifyOperation,
};
use std::borrow::Cow;
use std::cmp::Ordering;
@@ -130,9 +128,6 @@ pub type ForkChoiceError = fork_choice::Error<crate::ForkChoiceStoreError>;
/// Alias to appease clippy.
type HashBlockTuple<E> = (Hash256, RpcBlock<E>);
/// The time-out before failure during an operation to take a read/write RwLock on the block
/// processing cache.
pub const BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT: Duration = Duration::from_secs(1);
/// The time-out before failure during an operation to take a read/write RwLock on the
/// attestation cache.
pub const ATTESTATION_CACHE_LOCK_TIMEOUT: Duration = Duration::from_secs(1);
@@ -176,6 +171,7 @@ pub const INVALID_FINALIZED_MERGE_TRANSITION_BLOCK_SHUTDOWN_REASON: &str =
"Finalized merge transition block is invalid.";
/// Defines the behaviour when a block/block-root for a skipped slot is requested.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WhenSlotSkipped {
/// If the slot is a skip slot, return `None`.
///
@@ -450,8 +446,6 @@ pub struct BeaconChain<T: BeaconChainTypes> {
pub event_handler: Option<ServerSentEventHandler<T::EthSpec>>,
/// Used to track the heads of the beacon chain.
pub(crate) head_tracker: Arc<HeadTracker>,
/// A cache dedicated to block processing.
pub(crate) snapshot_cache: TimeoutRwLock<SnapshotCache<T::EthSpec>>,
/// Caches the attester shuffling for a given epoch and shuffling key root.
pub shuffling_cache: TimeoutRwLock<ShufflingCache>,
/// A cache of eth1 deposit data at epoch boundaries for deposit finalization
@@ -492,11 +486,6 @@ pub struct BeaconChain<T: BeaconChainTypes> {
pub data_availability_checker: Arc<DataAvailabilityChecker<T>>,
/// The KZG trusted setup used by this chain.
pub kzg: Option<Arc<Kzg>>,
/// State with complete tree hash cache, ready for block production.
///
/// NB: We can delete this once we have tree-states.
#[allow(clippy::type_complexity)]
pub block_production_state: Arc<Mutex<Option<(Hash256, BlockProductionPreState<T::EthSpec>)>>>,
}
pub enum BeaconBlockResponseWrapper<E: EthSpec> {
@@ -773,7 +762,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
let iter = self.store.forwards_block_roots_iterator(
start_slot,
local_head.beacon_state.clone_with(CloneConfig::none()),
local_head.beacon_state.clone(),
local_head.beacon_block_root,
&self.spec,
)?;
@@ -803,12 +792,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
let iter = self.store.forwards_block_roots_iterator_until(
start_slot,
end_slot,
|| {
Ok((
head.beacon_state.clone_with_only_committee_caches(),
head.beacon_block_root,
))
},
|| Ok((head.beacon_state.clone(), head.beacon_block_root)),
&self.spec,
)?;
Ok(iter
@@ -879,7 +863,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
let iter = self.store.forwards_state_roots_iterator(
start_slot,
local_head.beacon_state_root(),
local_head.beacon_state.clone_with(CloneConfig::none()),
local_head.beacon_state.clone(),
&self.spec,
)?;
@@ -900,12 +884,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
let iter = self.store.forwards_state_roots_iterator_until(
start_slot,
end_slot,
|| {
Ok((
head.beacon_state.clone_with_only_committee_caches(),
head.beacon_state_root(),
))
},
|| Ok((head.beacon_state.clone(), head.beacon_state_root())),
&self.spec,
)?;
Ok(iter
@@ -3561,29 +3540,6 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
});
}
self.snapshot_cache
.try_write_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
.ok_or(Error::SnapshotCacheLockTimeout)
.map(|mut snapshot_cache| {
snapshot_cache.insert(
BeaconSnapshot {
beacon_state: state,
beacon_block: signed_block.clone(),
beacon_block_root: block_root,
},
None,
&self.spec,
)
})
.unwrap_or_else(|e| {
error!(
self.log,
"Failed to insert snapshot";
"error" => ?e,
"task" => "process block"
);
});
self.head_tracker
.register_block(block_root, parent_root, slot);
@@ -4145,22 +4101,22 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
self.wait_for_fork_choice_before_block_production(slot)?;
drop(fork_choice_timer);
// Producing a block requires the tree hash cache, so clone a full state corresponding to
// the head from the snapshot cache. Unfortunately we can't move the snapshot out of the
// cache (which would be fast), because we need to re-process the block after it has been
// signed. If we miss the cache or we're producing a block that conflicts with the head,
// fall back to getting the head from `slot - 1`.
let state_load_timer = metrics::start_timer(&metrics::BLOCK_PRODUCTION_STATE_LOAD_TIMES);
// Atomically read some values from the head whilst avoiding holding cached head `Arc` any
// longer than necessary.
let (head_slot, head_block_root) = {
let (head_slot, head_block_root, head_state_root) = {
let head = self.canonical_head.cached_head();
(head.head_slot(), head.head_block_root())
(
head.head_slot(),
head.head_block_root(),
head.head_state_root(),
)
};
let (state, state_root_opt) = if head_slot < slot {
// Attempt an aggressive re-org if configured and the conditions are right.
if let Some(re_org_state) = self.get_state_for_re_org(slot, head_slot, head_block_root)
if let Some((re_org_state, re_org_state_root)) =
self.get_state_for_re_org(slot, head_slot, head_block_root)
{
info!(
self.log,
@@ -4168,37 +4124,16 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
"slot" => slot,
"head_to_reorg" => %head_block_root,
);
(re_org_state.pre_state, re_org_state.state_root)
}
// Normal case: proposing a block atop the current head using the cache.
else if let Some((_, cached_state)) =
self.get_state_from_block_production_cache(head_block_root)
{
(cached_state.pre_state, cached_state.state_root)
}
// Fall back to a direct read of the snapshot cache.
else if let Some(pre_state) =
self.get_state_from_snapshot_cache_for_block_production(head_block_root)
{
warn!(
self.log,
"Block production cache miss";
"message" => "falling back to snapshot cache clone",
"slot" => slot
);
(pre_state.pre_state, pre_state.state_root)
(re_org_state, Some(re_org_state_root))
} else {
warn!(
self.log,
"Block production cache miss";
"message" => "this block is more likely to be orphaned",
"slot" => slot,
);
let state = self
.state_at_slot(slot - 1, StateSkipConfig::WithStateRoots)
.map_err(|_| BlockProductionError::UnableToProduceAtSlot(slot))?;
(state, None)
// Fetch the head state advanced through to `slot`, which should be present in the
// state cache thanks to the state advance timer.
let (state_root, state) = self
.store
.get_advanced_hot_state(head_block_root, slot, head_state_root)
.map_err(BlockProductionError::FailedToLoadState)?
.ok_or(BlockProductionError::UnableToProduceAtSlot(slot))?;
(state, Some(state_root))
}
} else {
warn!(
@@ -4219,40 +4154,6 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
Ok((state, state_root_opt))
}
/// Get the state cached for block production *if* it matches `head_block_root`.
///
/// This will clear the cache regardless of whether the block root matches, so only call this if
/// you think the `head_block_root` is likely to match!
fn get_state_from_block_production_cache(
&self,
head_block_root: Hash256,
) -> Option<(Hash256, BlockProductionPreState<T::EthSpec>)> {
// Take care to drop the lock as quickly as possible.
let mut lock = self.block_production_state.lock();
let result = lock
.take()
.filter(|(cached_block_root, _)| *cached_block_root == head_block_root);
drop(lock);
result
}
/// Get a state for block production from the snapshot cache.
fn get_state_from_snapshot_cache_for_block_production(
&self,
head_block_root: Hash256,
) -> Option<BlockProductionPreState<T::EthSpec>> {
if let Some(lock) = self
.snapshot_cache
.try_read_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
{
let result = lock.get_state_for_block_production(head_block_root);
drop(lock);
result
} else {
None
}
}
/// Fetch the beacon state to use for producing a block if a 1-slot proposer re-org is viable.
///
/// This function will return `None` if proposer re-orgs are disabled.
@@ -4261,7 +4162,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
slot: Slot,
head_slot: Slot,
canonical_head: Hash256,
) -> Option<BlockProductionPreState<T::EthSpec>> {
) -> Option<(BeaconState<T::EthSpec>, Hash256)> {
let re_org_head_threshold = self.config.re_org_head_threshold?;
let re_org_parent_threshold = self.config.re_org_parent_threshold?;
@@ -4345,26 +4246,14 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
drop(proposer_head_timer);
let re_org_parent_block = proposer_head.parent_node.root;
// Only attempt a re-org if we hit the block production cache or snapshot cache.
let pre_state = self
.get_state_from_block_production_cache(re_org_parent_block)
.map(|(_, state)| state)
let (state_root, state) = self
.store
.get_advanced_hot_state_from_cache(re_org_parent_block, slot)
.or_else(|| {
warn!(
self.log,
"Block production cache miss";
"message" => "falling back to snapshot cache during re-org",
"slot" => slot,
"block_root" => ?re_org_parent_block
);
self.get_state_from_snapshot_cache_for_block_production(re_org_parent_block)
})
.or_else(|| {
debug!(
self.log,
"Not attempting re-org";
"reason" => "missed snapshot cache",
"parent_block" => ?re_org_parent_block,
"reason" => "no state in cache"
);
None
})?;
@@ -4378,7 +4267,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
"threshold_weight" => proposer_head.re_org_head_weight_threshold
);
Some(pre_state)
Some((state, state_root))
}
/// Get the proposer index and `prev_randao` value for a proposal at slot `proposal_slot`.
@@ -4506,20 +4395,6 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
let (unadvanced_state, unadvanced_state_root) =
if cached_head.head_block_root() == parent_block_root {
(Cow::Borrowed(head_state), cached_head.head_state_root())
} else if let Some(snapshot) = self
.snapshot_cache
.try_read_for(BLOCK_PROCESSING_CACHE_LOCK_TIMEOUT)
.ok_or(Error::SnapshotCacheLockTimeout)?
.get_cloned(parent_block_root, CloneConfig::none())
{
debug!(
self.log,
"Hit snapshot cache during withdrawals calculation";
"slot" => proposal_slot,
"parent_block_root" => ?parent_block_root,
);
let state_root = snapshot.beacon_state_root();
(Cow::Owned(snapshot.beacon_state), state_root)
} else {
info!(
self.log,
@@ -4530,10 +4405,11 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
let block = self
.get_blinded_block(&parent_block_root)?
.ok_or(Error::MissingBeaconBlock(parent_block_root))?;
let state = self
.get_state(&block.state_root(), Some(block.slot()))?
let (state_root, state) = self
.store
.get_advanced_hot_state(parent_block_root, proposal_slot, block.state_root())?
.ok_or(Error::MissingBeaconState(block.state_root()))?;
(Cow::Owned(state), block.state_root())
(Cow::Owned(state), state_root)
};
// Parent state epoch is the same as the proposal, we don't need to advance because the
@@ -4910,6 +4786,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
drop(slot_timer);
state.build_committee_cache(RelativeEpoch::Current, &self.spec)?;
state.apply_pending_mutations()?;
let parent_root = if state.slot() > 0 {
*state
@@ -5383,7 +5260,6 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
&mut state,
&block,
signature_strategy,
StateProcessingStrategy::Accurate,
VerifyBlockRoot::True,
&mut ctxt,
&self.spec,
@@ -6308,6 +6184,17 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
"head_block_root" => head_block_root.to_string(),
);
// If the block's state will be so far ahead of `shuffling_epoch` that even its
// previous epoch committee cache will be too new, then error. Callers of this function
// shouldn't be requesting such old shufflings for this `head_block_root`.
let head_block_epoch = head_block.slot.epoch(T::EthSpec::slots_per_epoch());
if head_block_epoch > shuffling_epoch + 1 {
return Err(Error::InvalidStateForShuffling {
state_epoch: head_block_epoch,
shuffling_epoch,
});
}
let state_read_timer =
metrics::start_timer(&metrics::ATTESTATION_PROCESSING_STATE_READ_TIMES);
@@ -6318,71 +6205,52 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
// to copy the head is liable to race-conditions.
let head_state_opt = self.with_head(|head| {
if head.beacon_block_root == head_block_root {
Ok(Some((
head.beacon_state
.clone_with(CloneConfig::committee_caches_only()),
head.beacon_state_root(),
)))
Ok(Some((head.beacon_state.clone(), head.beacon_state_root())))
} else {
Ok::<_, Error>(None)
}
})?;
// Compute the `target_slot` to advance the block's state to.
//
// Since there's a one-epoch look-ahead on the attester shuffling, it suffices to
// only advance into the first slot of the epoch prior to `shuffling_epoch`.
//
// If the `head_block` is already ahead of that slot, then we should load the state
// at that slot, as we've determined above that the `shuffling_epoch` cache will
// not be too far in the past.
let target_slot = std::cmp::max(
shuffling_epoch
.saturating_sub(1_u64)
.start_slot(T::EthSpec::slots_per_epoch()),
head_block.slot,
);
// If the head state is useful for this request, use it. Otherwise, read a state from
// disk.
// disk that is advanced as close as possible to `target_slot`.
let (mut state, state_root) = if let Some((state, state_root)) = head_state_opt {
(state, state_root)
} else {
let block_state_root = head_block.state_root;
let max_slot = shuffling_epoch.start_slot(T::EthSpec::slots_per_epoch());
let (state_root, state) = self
.store
.get_inconsistent_state_for_attestation_verification_only(
&head_block_root,
max_slot,
block_state_root,
)?
.ok_or(Error::MissingBeaconState(block_state_root))?;
.get_advanced_hot_state(head_block_root, target_slot, head_block.state_root)?
.ok_or(Error::MissingBeaconState(head_block.state_root))?;
(state, state_root)
};
/*
* IMPORTANT
*
* Since it's possible that
* `Store::get_inconsistent_state_for_attestation_verification_only` was used to obtain
* the state, we cannot rely upon the following fields:
*
* - `state.state_roots`
* - `state.block_roots`
*
* These fields should not be used for the rest of this function.
*/
metrics::stop_timer(state_read_timer);
let state_skip_timer =
metrics::start_timer(&metrics::ATTESTATION_PROCESSING_STATE_SKIP_TIMES);
// If the state is in an earlier epoch, advance it. If it's from a later epoch, reject
// it.
// If the state is still in an earlier epoch, advance it to the `target_slot` so
// that its next epoch committee cache matches the `shuffling_epoch`.
if state.current_epoch() + 1 < shuffling_epoch {
// Since there's a one-epoch look-ahead on the attester shuffling, it suffices to
// only advance into the slot prior to the `shuffling_epoch`.
let target_slot = shuffling_epoch
.saturating_sub(1_u64)
.start_slot(T::EthSpec::slots_per_epoch());
// Advance the state into the required slot, using the "partial" method since the state
// roots are not relevant for the shuffling.
// Advance the state into the required slot, using the "partial" method since the
// state roots are not relevant for the shuffling.
partial_state_advance(&mut state, Some(state_root), target_slot, &self.spec)?;
} else if state.current_epoch() > shuffling_epoch {
return Err(Error::InvalidStateForShuffling {
state_epoch: state.current_epoch(),
shuffling_epoch,
});
}
metrics::stop_timer(state_skip_timer);
let committee_building_timer =
metrics::start_timer(&metrics::ATTESTATION_PROCESSING_COMMITTEE_BUILDING_TIMES);
@@ -6391,8 +6259,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
state.build_committee_cache(relative_epoch, &self.spec)?;
let committee_cache = state.take_committee_cache(relative_epoch)?;
let committee_cache = Arc::new(committee_cache);
let committee_cache = state.committee_cache(relative_epoch)?.clone();
let shuffling_decision_block = shuffling_id.shuffling_decision_block;
self.shuffling_cache
@@ -6412,29 +6279,27 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
///
/// This could be a very expensive operation and should only be done in testing/analysis
/// activities.
///
/// This dump function previously used a backwards iterator but has been swapped to a forwards
/// iterator as it allows for MUCH better caching and rebasing. Memory usage of some tests went
/// from 5GB per test to 90MB.
#[allow(clippy::type_complexity)]
pub fn chain_dump(
&self,
) -> Result<Vec<BeaconSnapshot<T::EthSpec, BlindedPayload<T::EthSpec>>>, Error> {
let mut dump = vec![];
let mut last_slot = {
let head = self.canonical_head.cached_head();
BeaconSnapshot {
beacon_block: Arc::new(head.snapshot.beacon_block.clone_as_blinded()),
beacon_block_root: head.snapshot.beacon_block_root,
beacon_state: head.snapshot.beacon_state.clone(),
}
};
dump.push(last_slot.clone());
loop {
let beacon_block_root = last_slot.beacon_block.parent_root();
if beacon_block_root == Hash256::zero() {
break; // Genesis has been reached.
let mut prev_block_root = None;
let mut prev_beacon_state = None;
for res in self.forwards_iter_block_roots(Slot::new(0))? {
let (beacon_block_root, _) = res?;
// Do not include snapshots at skipped slots.
if Some(beacon_block_root) == prev_block_root {
continue;
}
prev_block_root = Some(beacon_block_root);
let beacon_block = self
.store
@@ -6443,25 +6308,31 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
Error::DBInconsistent(format!("Missing block {}", beacon_block_root))
})?;
let beacon_state_root = beacon_block.state_root();
let beacon_state = self
let mut beacon_state = self
.store
.get_state(&beacon_state_root, Some(beacon_block.slot()))?
.ok_or_else(|| {
Error::DBInconsistent(format!("Missing state {:?}", beacon_state_root))
})?;
let slot = BeaconSnapshot {
// This beacon state might come from the freezer DB, which means it could have pending
// updates or lots of untethered memory. We rebase it on the previous state in order to
// address this.
beacon_state.apply_pending_mutations()?;
if let Some(prev) = prev_beacon_state {
beacon_state.rebase_on(&prev, &self.spec)?;
}
beacon_state.build_caches(&self.spec)?;
prev_beacon_state = Some(beacon_state.clone());
let snapshot = BeaconSnapshot {
beacon_block: Arc::new(beacon_block),
beacon_block_root,
beacon_state,
};
dump.push(slot.clone());
last_slot = slot;
dump.push(snapshot);
}
dump.reverse();
Ok(dump)
}
@@ -6696,6 +6567,10 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
self.data_availability_checker.data_availability_boundary()
}
pub fn logger(&self) -> &Logger {
&self.log
}
/// Gets the `LightClientBootstrap` object for a requested block root.
///
/// Returns `None` when the state or block is not found in the database.