diff --git a/Cargo.lock b/Cargo.lock index 31cccc6a98..516d0df358 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1235,10 +1235,12 @@ dependencies = [ "eth2", "ethereum_ssz", "lighthouse_version", + "mockito", "reqwest 0.11.27", "sensitive_url", "serde", "serde_json", + "tokio", ] [[package]] @@ -2581,6 +2583,18 @@ dependencies = [ "sha2 0.10.8", ] +[[package]] +name = "eip_3076" +version = "0.1.0" +dependencies = [ + "arbitrary", + "ethereum_serde_utils", + "serde", + "serde_json", + "tempfile", + "types", +] + [[package]] name = "either" version = "1.15.0" @@ -2848,6 +2862,7 @@ name = "eth2" version = "0.1.0" dependencies = [ "derivative", + "eip_3076", "either", "enr", "eth2_keystore", @@ -2867,7 +2882,6 @@ dependencies = [ "sensitive_url", "serde", "serde_json", - "slashing_protection", "ssz_types", "test_random_derive", "tokio", @@ -8832,6 +8846,7 @@ name = "slashing_protection" version = "0.1.0" dependencies = [ "arbitrary", + "eip_3076", "ethereum_serde_utils", "filesystem", "r2d2", diff --git a/Cargo.toml b/Cargo.toml index a46dc355e7..ae84d645bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "common/compare_fields_derive", "common/deposit_contract", "common/directory", + "common/eip_3076", "common/eth2", "common/eth2_config", "common/eth2_interop_keypairs", @@ -135,6 +136,7 @@ directory = { path = "common/directory" } dirs = "3" discv5 = { version = "0.10", features = ["libp2p"] } doppelganger_service = { path = "validator_client/doppelganger_service" } +eip_3076 = { path = "common/eip_3076" } either = "1.9" environment = { path = "lighthouse/environment" } eth2 = { path = "common/eth2" } diff --git a/beacon_node/Cargo.toml b/beacon_node/Cargo.toml index 8e2c598fd4..985f4c1752 100644 --- a/beacon_node/Cargo.toml +++ b/beacon_node/Cargo.toml @@ -15,6 +15,7 @@ path = "src/lib.rs" write_ssz_files = [ "beacon_chain/write_ssz_files", ] # Writes debugging .ssz files to /tmp during block processing. 
+testing = [] # Enables testing-only CLI flags [dependencies] account_utils = { workspace = true } diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 760e447b75..5a82dc70c5 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -21,6 +21,7 @@ use crate::block_verification_types::{ }; pub use crate::canonical_head::CanonicalHead; use crate::chain_config::ChainConfig; +use crate::custody_context::CustodyContextSsz; use crate::data_availability_checker::{ Availability, AvailabilityCheckError, AvailableBlock, AvailableBlockData, DataAvailabilityChecker, DataColumnReconstructionResult, @@ -64,7 +65,6 @@ use crate::shuffling_cache::{BlockShufflingIds, ShufflingCache}; use crate::sync_committee_verification::{ Error as SyncCommitteeError, VerifiedSyncCommitteeMessage, VerifiedSyncContribution, }; -use crate::validator_custody::CustodyContextSsz; use crate::validator_monitor::{ HISTORIC_EPOCHS as VALIDATOR_MONITOR_HISTORIC_EPOCHS, ValidatorMonitor, get_slot_delay_ms, timestamp_now, @@ -3564,7 +3564,7 @@ impl BeaconChain { .await } - fn check_blobs_for_slashability<'a>( + fn check_blob_header_signature_and_slashability<'a>( self: &Arc, block_root: Hash256, blobs: impl IntoIterator>, @@ -3575,17 +3575,20 @@ impl BeaconChain { .map(|b| b.signed_block_header.clone()) .unique() { - if verify_header_signature::(self, &header).is_ok() { - slashable_cache - .observe_slashable( - header.message.slot, - header.message.proposer_index, - block_root, - ) - .map_err(|e| BlockError::BeaconChainError(Box::new(e.into())))?; - if let Some(slasher) = self.slasher.as_ref() { - slasher.accept_block_header(header); - } + // Return an error if *any* header signature is invalid, we do not want to import this + // list of blobs into the DA checker. However, we will process any valid headers prior + // to the first invalid header in the slashable cache & slasher. 
+ verify_header_signature::(self, &header)?; + + slashable_cache + .observe_slashable( + header.message.slot, + header.message.proposer_index, + block_root, + ) + .map_err(|e| BlockError::BeaconChainError(Box::new(e.into())))?; + if let Some(slasher) = self.slasher.as_ref() { + slasher.accept_block_header(header); } } Ok(()) @@ -3599,7 +3602,10 @@ impl BeaconChain { block_root: Hash256, blobs: FixedBlobSidecarList, ) -> Result { - self.check_blobs_for_slashability(block_root, blobs.iter().flatten().map(Arc::as_ref))?; + self.check_blob_header_signature_and_slashability( + block_root, + blobs.iter().flatten().map(Arc::as_ref), + )?; let availability = self .data_availability_checker .put_rpc_blobs(block_root, blobs)?; @@ -3616,12 +3622,15 @@ impl BeaconChain { ) -> Result { let availability = match engine_get_blobs_output { EngineGetBlobsOutput::Blobs(blobs) => { - self.check_blobs_for_slashability(block_root, blobs.iter().map(|b| b.as_blob()))?; + self.check_blob_header_signature_and_slashability( + block_root, + blobs.iter().map(|b| b.as_blob()), + )?; self.data_availability_checker .put_kzg_verified_blobs(block_root, blobs)? 
} EngineGetBlobsOutput::CustodyColumns(data_columns) => { - self.check_columns_for_slashability( + self.check_data_column_sidecar_header_signature_and_slashability( block_root, data_columns.iter().map(|c| c.as_data_column()), )?; @@ -3642,7 +3651,7 @@ impl BeaconChain { block_root: Hash256, custody_columns: DataColumnSidecarList, ) -> Result { - self.check_columns_for_slashability( + self.check_data_column_sidecar_header_signature_and_slashability( block_root, custody_columns.iter().map(|c| c.as_ref()), )?; @@ -3659,7 +3668,7 @@ impl BeaconChain { .await } - fn check_columns_for_slashability<'a>( + fn check_data_column_sidecar_header_signature_and_slashability<'a>( self: &Arc, block_root: Hash256, custody_columns: impl IntoIterator>, @@ -3673,17 +3682,20 @@ impl BeaconChain { .map(|c| c.signed_block_header.clone()) .unique() { - if verify_header_signature::(self, &header).is_ok() { - slashable_cache - .observe_slashable( - header.message.slot, - header.message.proposer_index, - block_root, - ) - .map_err(|e| BlockError::BeaconChainError(Box::new(e.into())))?; - if let Some(slasher) = self.slasher.as_ref() { - slasher.accept_block_header(header); - } + // Return an error if *any* header signature is invalid, we do not want to import this + // list of blobs into the DA checker. However, we will process any valid headers prior + // to the first invalid header in the slashable cache & slasher. + verify_header_signature::(self, &header)?; + + slashable_cache + .observe_slashable( + header.message.slot, + header.message.proposer_index, + block_root, + ) + .map_err(|e| BlockError::BeaconChainError(Box::new(e.into())))?; + if let Some(slasher) = self.slasher.as_ref() { + slasher.accept_block_header(header); } } Ok(()) @@ -4726,6 +4738,11 @@ impl BeaconChain { // efficient packing of execution blocks. 
Err(Error::SkipProposerPreparation) } else { + debug!( + ?shuffling_decision_root, + epoch = %proposal_epoch, + "Proposer shuffling cache miss for proposer prep" + ); let head = self.canonical_head.cached_head(); Ok(( head.head_state_root(), @@ -6516,6 +6533,26 @@ impl BeaconChain { } } + /// This function provides safe and efficient multi-threaded access to the beacon proposer cache. + /// + /// The arguments are: + /// + /// - `shuffling_decision_block`: The block root of the decision block for the desired proposer + /// shuffling. This should be computed using one of the methods for computing proposer + /// shuffling decision roots, e.g. `BeaconState::proposer_shuffling_decision_root_at_epoch`. + /// - `proposal_epoch`: The epoch at which the proposer shuffling is required. + /// - `accessor`: A closure to run against the proposers for the selected epoch. Usually this + /// closure just grabs a single proposer, or takes the vec of proposers for the epoch. + /// - `state_provider`: A closure to compute a state suitable for determining the shuffling. + /// This closure is evaluated lazily ONLY in the case that a cache miss occurs. It is + /// recommended for code that wants to keep track of cache misses to produce a log and/or + /// increment a metric inside this closure . + /// + /// This function makes use of closures in order to efficiently handle concurrent accesses to + /// the cache. + /// + /// The error type is polymorphic, if in doubt you can use `BeaconChainError`. You might need + /// to use a turbofish if type inference can't work it out. pub fn with_proposer_cache + From>( &self, shuffling_decision_block: Hash256, @@ -6534,12 +6571,6 @@ impl BeaconChain { // If it is already initialised, then `get_or_try_init` will return immediately without // executing the initialisation code at all. 
let epoch_block_proposers = cache_entry.get_or_try_init(|| { - debug!( - ?shuffling_decision_block, - %proposal_epoch, - "Proposer shuffling cache miss" - ); - // Fetch the state on-demand if the required epoch was missing from the cache. // If the caller wants to not compute the state they must return an error here and then // catch it at the call site. @@ -6569,11 +6600,18 @@ impl BeaconChain { } let proposers = state.get_beacon_proposer_indices(proposal_epoch, &self.spec)?; - Ok::<_, E>(EpochBlockProposers::new( - proposal_epoch, - state.fork(), - proposers, - )) + + // Use fork_at_epoch rather than the state's fork, because post-Fulu we may not have + // advanced the state completely into the new epoch. + let fork = self.spec.fork_at_epoch(proposal_epoch); + + debug!( + ?shuffling_decision_block, + epoch = %proposal_epoch, + "Priming proposer shuffling cache" + ); + + Ok::<_, E>(EpochBlockProposers::new(proposal_epoch, fork, proposers)) })?; // Run the accessor function on the computed epoch proposers. @@ -6867,9 +6905,138 @@ impl BeaconChain { pub fn update_data_column_custody_info(&self, slot: Option) { self.store .put_data_column_custody_info(slot) - .unwrap_or_else( - |e| tracing::error!(error = ?e, "Failed to update data column custody info"), + .unwrap_or_else(|e| error!(error = ?e, "Failed to update data column custody info")); + } + + /// Get the earliest epoch in which the node has met its custody requirements. 
+ /// A `None` response indicates that we've met our custody requirements up to the + /// column data availability window + pub fn earliest_custodied_data_column_epoch(&self) -> Option { + self.store + .get_data_column_custody_info() + .inspect_err( + |e| error!(error=?e, "Failed to get data column custody info from the store"), + ) + .ok() + .flatten() + .and_then(|info| info.earliest_data_column_slot) + .map(|slot| { + let mut epoch = slot.epoch(T::EthSpec::slots_per_epoch()); + // If the earliest custodied slot isn't the first slot in the epoch + // The node has only met its custody requirements for the next epoch. + if slot > epoch.start_slot(T::EthSpec::slots_per_epoch()) { + epoch += 1; + } + epoch + }) + } + + /// The data availability boundary for custodying columns. It will just be the + /// regular data availability boundary unless we are near the Fulu fork epoch. + pub fn column_data_availability_boundary(&self) -> Option { + match self.data_availability_boundary() { + Some(da_boundary_epoch) => { + if let Some(fulu_fork_epoch) = self.spec.fulu_fork_epoch { + if da_boundary_epoch < fulu_fork_epoch { + Some(fulu_fork_epoch) + } else { + Some(da_boundary_epoch) + } + } else { + None // Fulu hasn't been enabled + } + } + None => None, // Deneb hasn't been enabled + } + } + + /// Safely update data column custody info by ensuring that: + /// - cgc values at the updated epoch and the earliest custodied column epoch are equal + /// - we are only decrementing the earliest custodied data column epoch by one epoch + /// - the new earliest data column slot is set to the first slot in `effective_epoch`. 
+ pub fn safely_backfill_data_column_custody_info( + &self, + effective_epoch: Epoch, + ) -> Result<(), Error> { + let Some(earliest_data_column_epoch) = self.earliest_custodied_data_column_epoch() else { + return Ok(()); + }; + + if effective_epoch >= earliest_data_column_epoch { + return Ok(()); + } + + let cgc_at_effective_epoch = self + .data_availability_checker + .custody_context() + .custody_group_count_at_epoch(effective_epoch, &self.spec); + + let cgc_at_earliest_data_colum_epoch = self + .data_availability_checker + .custody_context() + .custody_group_count_at_epoch(earliest_data_column_epoch, &self.spec); + + let can_update_data_column_custody_info = cgc_at_effective_epoch + == cgc_at_earliest_data_colum_epoch + && effective_epoch == earliest_data_column_epoch - 1; + + if can_update_data_column_custody_info { + self.store.put_data_column_custody_info(Some( + effective_epoch.start_slot(T::EthSpec::slots_per_epoch()), + ))?; + } else { + error!( + ?cgc_at_effective_epoch, + ?cgc_at_earliest_data_colum_epoch, + ?effective_epoch, + ?earliest_data_column_epoch, + "Couldn't update data column custody info" ); + return Err(Error::FailedColumnCustodyInfoUpdate); + } + + Ok(()) + } + + /// Compare columns custodied for `epoch` versus columns custodied for the head of the chain + /// and return any column indices that are missing. + pub fn get_missing_columns_for_epoch(&self, epoch: Epoch) -> HashSet { + let custody_context = self.data_availability_checker.custody_context(); + + let columns_required = custody_context + .custody_columns_for_epoch(None, &self.spec) + .iter() + .cloned() + .collect::>(); + + let current_columns_at_epoch = custody_context + .custody_columns_for_epoch(Some(epoch), &self.spec) + .iter() + .cloned() + .collect::>(); + + columns_required + .difference(¤t_columns_at_epoch) + .cloned() + .collect::>() + } + + /// The da boundary for custodying columns. It will just be the DA boundary unless we are near the Fulu fork epoch. 
+ pub fn get_column_da_boundary(&self) -> Option { + match self.data_availability_boundary() { + Some(da_boundary_epoch) => { + if let Some(fulu_fork_epoch) = self.spec.fulu_fork_epoch { + if da_boundary_epoch < fulu_fork_epoch { + Some(fulu_fork_epoch) + } else { + Some(da_boundary_epoch) + } + } else { + None + } + } + None => None, // If no DA boundary set, dont try to custody backfill + } } /// This method serves to get a sense of the current chain health. It is used in block proposal diff --git a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs index a64b4981cc..6effce49f8 100644 --- a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs +++ b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs @@ -17,6 +17,7 @@ use smallvec::SmallVec; use state_processing::state_advance::partial_state_advance; use std::num::NonZeroUsize; use std::sync::Arc; +use tracing::instrument; use types::non_zero_usize::new_non_zero_usize; use types::{ BeaconState, BeaconStateError, ChainSpec, Epoch, EthSpec, Fork, Hash256, Slot, Unsigned, @@ -199,11 +200,14 @@ pub fn compute_proposer_duties_from_head( .map_err(BeaconChainError::from)?; let dependent_root = state - // The only block which decides its own shuffling is the genesis block. - .proposer_shuffling_decision_root(chain.genesis_block_root, &chain.spec) + .proposer_shuffling_decision_root_at_epoch(request_epoch, head_block_root, &chain.spec) .map_err(BeaconChainError::from)?; - Ok((indices, dependent_root, execution_status, state.fork())) + // Use fork_at_epoch rather than the state's fork, because post-Fulu we may not have advanced + // the state completely into the new epoch. + let fork = chain.spec.fork_at_epoch(request_epoch); + + Ok((indices, dependent_root, execution_status, fork)) } /// If required, advance `state` to the epoch required to determine proposer indices in `target_epoch`. 
@@ -214,6 +218,7 @@ pub fn compute_proposer_duties_from_head( /// - No-op if `state.current_epoch() == target_epoch`. /// - It must be the case that `state.canonical_root() == state_root`, but this function will not /// check that. +#[instrument(skip_all, fields(?state_root, %target_epoch, state_slot = %state.slot()), level = "debug")] pub fn ensure_state_can_determine_proposers_for_epoch( state: &mut BeaconState, state_root: Hash256, @@ -234,14 +239,6 @@ pub fn ensure_state_can_determine_proposers_for_epoch( if state.current_epoch() > maximum_epoch { Err(BeaconStateError::SlotOutOfBounds.into()) } else if state.current_epoch() >= minimum_epoch { - if target_epoch > state.current_epoch() { - let target_slot = target_epoch.start_slot(E::slots_per_epoch()); - - // Advance the state into the same epoch as the block. Use the "partial" method since state - // roots are not important for proposer/attester shuffling. - partial_state_advance(state, Some(state_root), target_slot, spec) - .map_err(BeaconChainError::from)?; - } Ok(()) } else { // State's current epoch is less than the minimum epoch. diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index d0ed8258e5..691293b200 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -950,8 +950,6 @@ impl GossipVerifiedBlock { let proposer_shuffling_decision_block = parent_block.proposer_shuffling_root_for_child_block(block_epoch, &chain.spec); - // We assign to a variable instead of using `if let Some` directly to ensure we drop the - // write lock before trying to acquire it again in the `else` clause. 
let block_slot = block.slot(); let mut opt_parent = None; let proposer = chain.with_proposer_cache::<_, BlockError>( diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs index 5564c7916f..750cde14ca 100644 --- a/beacon_node/beacon_chain/src/builder.rs +++ b/beacon_node/beacon_chain/src/builder.rs @@ -4,6 +4,7 @@ use crate::beacon_chain::{ BEACON_CHAIN_DB_KEY, CanonicalHead, LightClientProducerEvent, OP_POOL_DB_KEY, }; use crate::beacon_proposer_cache::BeaconProposerCache; +use crate::custody_context::NodeCustodyType; use crate::data_availability_checker::DataAvailabilityChecker; use crate::fork_choice_signal::ForkChoiceSignalTx; use crate::fork_revert::{reset_fork_choice_to_finalization, revert_to_fork_boundary}; @@ -100,7 +101,7 @@ pub struct BeaconChainBuilder { kzg: Arc, task_executor: Option, validator_monitor_config: Option, - import_all_data_columns: bool, + node_custody_type: NodeCustodyType, rng: Option>, } @@ -139,7 +140,7 @@ where kzg, task_executor: None, validator_monitor_config: None, - import_all_data_columns: false, + node_custody_type: NodeCustodyType::Fullnode, rng: None, } } @@ -640,9 +641,9 @@ where self } - /// Sets whether to require and import all data columns when importing block. - pub fn import_all_data_columns(mut self, import_all_data_columns: bool) -> Self { - self.import_all_data_columns = import_all_data_columns; + /// Sets the node custody type for data column import. 
+ pub fn node_custody_type(mut self, node_custody_type: NodeCustodyType) -> Self { + self.node_custody_type = node_custody_type; self } @@ -935,10 +936,11 @@ where { Arc::new(CustodyContext::new_from_persisted_custody_context( custody, - self.import_all_data_columns, + self.node_custody_type, + &self.spec, )) } else { - Arc::new(CustodyContext::new(self.import_all_data_columns)) + Arc::new(CustodyContext::new(self.node_custody_type, &self.spec)) }; debug!(?custody_context, "Loading persisted custody context"); diff --git a/beacon_node/beacon_chain/src/chain_config.rs b/beacon_node/beacon_chain/src/chain_config.rs index a7defa9fa2..1f5abc4891 100644 --- a/beacon_node/beacon_chain/src/chain_config.rs +++ b/beacon_node/beacon_chain/src/chain_config.rs @@ -1,3 +1,4 @@ +use crate::custody_context::NodeCustodyType; pub use proto_array::{DisallowedReOrgOffsets, ReOrgThreshold}; use serde::{Deserialize, Serialize}; use std::str::FromStr; @@ -118,6 +119,8 @@ pub struct ChainConfig { pub invalid_block_roots: HashSet, /// Disable the getBlobs optimisation to fetch blobs from the EL mempool. pub disable_get_blobs: bool, + /// The node's custody type, determining how many data columns to custody and sample. 
+ pub node_custody_type: NodeCustodyType, } impl Default for ChainConfig { @@ -158,6 +161,7 @@ impl Default for ChainConfig { data_column_publishing_delay: None, invalid_block_roots: HashSet::new(), disable_get_blobs: false, + node_custody_type: NodeCustodyType::Fullnode, } } } diff --git a/beacon_node/beacon_chain/src/validator_custody.rs b/beacon_node/beacon_chain/src/custody_context.rs similarity index 68% rename from beacon_node/beacon_chain/src/validator_custody.rs rename to beacon_node/beacon_chain/src/custody_context.rs index 3ab76828c9..7ec13a8b51 100644 --- a/beacon_node/beacon_chain/src/validator_custody.rs +++ b/beacon_node/beacon_chain/src/custody_context.rs @@ -1,4 +1,5 @@ use parking_lot::RwLock; +use serde::{Deserialize, Serialize}; use ssz_derive::{Decode, Encode}; use std::marker::PhantomData; use std::sync::OnceLock; @@ -6,11 +7,12 @@ use std::{ collections::{BTreeMap, HashMap}, sync::atomic::{AtomicU64, Ordering}, }; +use tracing::warn; use types::data_column_custody_group::{CustodyIndex, compute_columns_for_custody_group}; use types::{ChainSpec, ColumnIndex, Epoch, EthSpec, Slot}; /// A delay before making the CGC change effective to the data availability checker. -const CUSTODY_CHANGE_DA_EFFECTIVE_DELAY_SECONDS: u64 = 30; +pub const CUSTODY_CHANGE_DA_EFFECTIVE_DELAY_SECONDS: u64 = 30; /// Number of slots after which a validator's registration is removed if it has not re-registered. const VALIDATOR_REGISTRATION_EXPIRY_SLOTS: Slot = Slot::new(256); @@ -30,12 +32,36 @@ struct ValidatorRegistrations { /// /// Note: Only stores the epoch value when there's a change in custody requirement. /// So if epoch 10 and 11 has the same custody requirement, only 10 is stored. - /// This map is never pruned, because currently we never decrease custody requirement, so this - /// map size is contained at 128. + /// This map is only pruned during custody backfill. 
If epoch 11 has custody requirements + that are then backfilled to epoch 10, the value at epoch 11 will be removed and epoch 10 + will be added to the map instead. This should keep map size constrained to a maximum + value of 128. + + If the node is started with a cgc override (i.e. supernode/semi-supernode flag), the cgc + value is inserted into this map on initialisation with epoch set to 0. For a semi-supernode, + this means the custody requirement can still be increased if validator custody exceeds + 64 columns. epoch_validator_custody_requirements: BTreeMap, } impl ValidatorRegistrations { + /// Initialise the validator registration with some default custody requirements. + /// + /// If a `cgc_override` value is specified, the cgc value is inserted into the registration map + /// and is equivalent to registering validator(s) with the same custody requirement. + fn new(cgc_override: Option) -> Self { + let mut registrations = ValidatorRegistrations { + validators: Default::default(), + epoch_validator_custody_requirements: Default::default(), + }; + if let Some(custody_count) = cgc_override { + registrations + .epoch_validator_custody_requirements + .insert(Epoch::new(0), custody_count); + } + registrations + } + /// Returns the validator custody requirement at the latest epoch. fn latest_validator_custody_requirement(&self) -> Option { self.epoch_validator_custody_requirements @@ -99,6 +125,25 @@ impl ValidatorRegistrations { None } } + + /// Updates the `epoch_validator_custody_requirements` map by pruning all values on/after `effective_epoch` + /// and updating the map to store the latest validator custody requirements for the `effective_epoch`. + pub fn backfill_validator_custody_requirements(&mut self, effective_epoch: Epoch) { + if let Some(latest_validator_custody) = self.latest_validator_custody_requirement() { + // Delete records if + // 1. The epoch is greater than or equal to `effective_epoch` + // 2. 
the cgc requirements match the latest validator custody requirements + self.epoch_validator_custody_requirements + .retain(|&epoch, custody_requirement| { + !(epoch >= effective_epoch && *custody_requirement == latest_validator_custody) + }); + + self.epoch_validator_custody_requirements + .entry(effective_epoch) + .and_modify(|old_custody| *old_custody = latest_validator_custody) + .or_insert(latest_validator_custody); + } + } } /// Given the `validator_custody_units`, return the custody requirement based on @@ -118,6 +163,51 @@ fn get_validators_custody_requirement(validator_custody_units: u64, spec: &Chain ) } +/// Indicates the different "modes" that a node can run based on the cli +/// parameters that are relevant for computing the custody count. +/// +/// The custody count is derived from 2 values: +/// 1. The number of validators attached to the node and the spec parameters +/// that attach custody weight to attached validators. +/// 2. The cli parameters that the current node is running with. +/// +/// We always persist the validator custody units to the db across restarts +/// such that we know the validator custody units at any given epoch in the past. +/// However, knowing the cli parameter at any given epoch is a pain to maintain +/// and unnecessary. +/// +/// Therefore, the custody count at any point in time is calculated as the max of +/// the validator custody at that time and the current cli params. +/// +/// Choosing the max ensures that we always have the minimum required columns and +/// we can adjust the `status.earliest_available_slot` value to indicate to our peers +/// the columns that we can guarantee to serve. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, Deserialize, Serialize)] +pub enum NodeCustodyType { + /// The node is running with cli parameters to indicate that it + /// wants to subscribe to all columns. 
+    Supernode, + /// The node is running with cli parameters to indicate that it + /// wants to subscribe to the minimum number of columns to enable + /// reconstruction (50%) of the full blob data on demand. + SemiSupernode, + /// The node isn't running with any explicit cli parameters + /// or is running with cli parameters to indicate that it wants + /// to only subscribe to the minimal custody requirements. + #[default] + Fullnode, +} + +impl NodeCustodyType { + pub fn get_custody_count_override(&self, spec: &ChainSpec) -> Option { + match self { + Self::Fullnode => None, + Self::SemiSupernode => Some(spec.number_of_custody_groups / 2), + Self::Supernode => Some(spec.number_of_custody_groups), + } + } +} + /// Contains all the information the node requires to calculate the /// number of columns to be custodied when checking for DA. #[derive(Debug)] @@ -129,15 +219,6 @@ pub struct CustodyContext { /// we require for data availability check, and we use to advertise to our peers in the metadata /// and enr values. validator_custody_count: AtomicU64, - /// Is the node run as a supernode based on current cli parameters. - current_is_supernode: bool, - /// The persisted value for `is_supernode` based on the previous run of this node. - /// - /// Note: We require this value because if a user restarts the node with a higher cli custody - /// count value than in the previous run, then we should continue advertising the custody - /// count based on the old value than the new one since we haven't backfilled the required - /// columns. - persisted_is_supernode: bool, /// Maintains all the validators that this node is connected to currently validator_registrations: RwLock, /// Stores an immutable, ordered list of all custody columns as determined by the node's NodeID @@ -150,26 +231,45 @@ impl CustodyContext { /// Create a new default custody context object when no persisted object /// exists. /// - /// The `is_supernode` value is based on current cli parameters. 
- pub fn new(is_supernode: bool) -> Self { + /// The `node_custody_type` value is based on current cli parameters. + pub fn new(node_custody_type: NodeCustodyType, spec: &ChainSpec) -> Self { + let cgc_override = node_custody_type.get_custody_count_override(spec); + // If there's no override, we initialise `validator_custody_count` to 0. This has been the + // existing behaviour and we maintain this for now to avoid a semantic schema change until + // a later release. Self { - validator_custody_count: AtomicU64::new(0), - current_is_supernode: is_supernode, - persisted_is_supernode: is_supernode, - validator_registrations: Default::default(), + validator_custody_count: AtomicU64::new(cgc_override.unwrap_or(0)), + validator_registrations: RwLock::new(ValidatorRegistrations::new(cgc_override)), + all_custody_columns_ordered: OnceLock::new(), + _phantom_data: PhantomData, + } + } + + /// Restore the custody context from disk. + /// + /// * If NodeCustodyType::custody_count < validator_custody_at_head, it means the attached + /// validator stake has increased the node's CGC. We ignore the CLI input. + /// * If NodeCustodyType::custody_count > validator_custody_at_head, it means the user has + /// changed the node's custody type via either the --supernode or --semi-supernode flags, + /// and will require a resync until we implement column backfill for this scenario. 
pub fn new_from_persisted_custody_context( ssz_context: CustodyContextSsz, - is_supernode: bool, + node_custody_type: NodeCustodyType, + spec: &ChainSpec, ) -> Self { + let cgc_override = node_custody_type.get_custody_count_override(spec); + if let Some(cgc_from_cli) = cgc_override + && cgc_from_cli > ssz_context.validator_custody_at_head + { + warn!( + info = "node will continue to run with the current custody count", + current_custody_count = ssz_context.validator_custody_at_head, + node_custody_type = ?node_custody_type, + "Changing node type is currently not supported without a resync and will have no effect", + ); + } CustodyContext { validator_custody_count: AtomicU64::new(ssz_context.validator_custody_at_head), - current_is_supernode: is_supernode, - persisted_is_supernode: ssz_context.persisted_is_supernode, validator_registrations: RwLock::new(ValidatorRegistrations { validators: Default::default(), epoch_validator_custody_requirements: ssz_context @@ -228,12 +328,11 @@ impl CustodyContext { return None; }; - let current_cgc = self.custody_group_count_at_head(spec); - let validator_custody_count_at_head = self.validator_custody_count.load(Ordering::Relaxed); + let current_cgc = self.validator_custody_count.load(Ordering::Relaxed); - if new_validator_custody != validator_custody_count_at_head { + if new_validator_custody != current_cgc { tracing::debug!( - old_count = validator_custody_count_at_head, + old_count = current_cgc, new_count = new_validator_custody, "Validator count at head updated" ); @@ -250,6 +349,7 @@ impl CustodyContext { ); return Some(CustodyCountChanged { new_custody_group_count: updated_cgc, + old_custody_group_count: current_cgc, sampling_count: self.num_of_custody_groups_to_sample(effective_epoch, spec), effective_epoch, }); @@ -263,9 +363,6 @@ impl CustodyContext { /// Do NOT use this directly for data availability check, use `self.sampling_size` instead as /// CGC can change over epochs. 
pub fn custody_group_count_at_head(&self, spec: &ChainSpec) -> u64 { - if self.current_is_supernode { - return spec.number_of_custody_groups; - } let validator_custody_count_at_head = self.validator_custody_count.load(Ordering::Relaxed); // If there are no validators, return the minimum custody_requirement @@ -282,15 +379,11 @@ impl CustodyContext { /// minimum sampling size which may exceed the custody group count (CGC). /// /// See also: [`Self::num_of_custody_groups_to_sample`]. - fn custody_group_count_at_epoch(&self, epoch: Epoch, spec: &ChainSpec) -> u64 { - if self.current_is_supernode { - spec.number_of_custody_groups - } else { - self.validator_registrations - .read() - .custody_requirement_at_epoch(epoch) - .unwrap_or(spec.custody_requirement) - } + pub fn custody_group_count_at_epoch(&self, epoch: Epoch, spec: &ChainSpec) -> u64 { + self.validator_registrations + .read() + .custody_requirement_at_epoch(epoch) + .unwrap_or(spec.custody_requirement) } /// Returns the count of custody groups this node must _sample_ for a block at `epoch` to import. @@ -360,14 +453,22 @@ impl CustodyContext { .all_custody_columns_ordered .get() .expect("all_custody_columns_ordered should be initialized"); + &all_columns_ordered[..custody_group_count] } + + pub fn update_and_backfill_custody_count_at_epoch(&self, effective_epoch: Epoch) { + self.validator_registrations + .write() + .backfill_validator_custody_requirements(effective_epoch); + } } /// The custody count changed because of a change in the /// number of validators being managed. pub struct CustodyCountChanged { pub new_custody_group_count: u64, + pub old_custody_group_count: u64, pub sampling_count: u64, pub effective_epoch: Epoch, } @@ -376,6 +477,7 @@ pub struct CustodyCountChanged { #[derive(Debug, Encode, Decode, Clone)] pub struct CustodyContextSsz { pub validator_custody_at_head: u64, + /// DEPRECATED. This field is no longer in used and will be removed in a future release. 
pub persisted_is_supernode: bool, pub epoch_validator_custody_requirements: Vec<(Epoch, u64)>, } @@ -384,7 +486,8 @@ impl From<&CustodyContext> for CustodyContextSsz { fn from(context: &CustodyContext) -> Self { CustodyContextSsz { validator_custody_at_head: context.validator_custody_count.load(Ordering::Relaxed), - persisted_is_supernode: context.persisted_is_supernode, + // This field is deprecated and has no effect + persisted_is_supernode: false, epoch_validator_custody_requirements: context .validator_registrations .read() @@ -408,8 +511,8 @@ mod tests { #[test] fn no_validators_supernode_default() { - let custody_context = CustodyContext::::new(true); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Supernode, &spec); assert_eq!( custody_context.custody_group_count_at_head(&spec), spec.number_of_custody_groups @@ -421,9 +524,23 @@ mod tests { } #[test] - fn no_validators_fullnode_default() { - let custody_context = CustodyContext::::new(false); + fn no_validators_semi_supernode_default() { let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::SemiSupernode, &spec); + assert_eq!( + custody_context.custody_group_count_at_head(&spec), + spec.number_of_custody_groups / 2 + ); + assert_eq!( + custody_context.num_of_custody_groups_to_sample(Epoch::new(0), &spec), + spec.number_of_custody_groups / 2 + ); + } + + #[test] + fn no_validators_fullnode_default() { + let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); assert_eq!( custody_context.custody_group_count_at_head(&spec), spec.custody_requirement, @@ -437,8 +554,8 @@ mod tests { #[test] fn register_single_validator_should_update_cgc() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let bal_per_additional_group = spec.balance_per_additional_custody_group; 
let min_val_custody_requirement = spec.validator_custody_requirement; // One single node increases its balance over 3 epochs. @@ -461,8 +578,8 @@ mod tests { #[test] fn register_multiple_validators_should_update_cgc() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let bal_per_additional_group = spec.balance_per_additional_custody_group; let min_val_custody_requirement = spec.validator_custody_requirement; // Add 3 validators over 3 epochs. @@ -498,8 +615,8 @@ mod tests { #[test] fn register_validators_should_not_update_cgc_for_supernode() { - let custody_context = CustodyContext::::new(true); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Supernode, &spec); let bal_per_additional_group = spec.balance_per_additional_custody_group; // Add 3 validators over 3 epochs. @@ -536,8 +653,8 @@ mod tests { #[test] fn cgc_change_should_be_effective_to_sampling_after_delay() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let current_slot = Slot::new(10); let current_epoch = current_slot.epoch(E::slots_per_epoch()); let default_sampling_size = @@ -567,8 +684,8 @@ mod tests { #[test] fn validator_dropped_after_no_registrations_within_expiry_should_not_reduce_cgc() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let current_slot = Slot::new(10); let val_custody_units_1 = 10; let val_custody_units_2 = 5; @@ -609,8 +726,8 @@ mod tests { #[test] fn validator_dropped_after_no_registrations_within_expiry() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let current_slot = 
Slot::new(10); let val_custody_units_1 = 10; let val_custody_units_2 = 5; @@ -660,7 +777,7 @@ mod tests { #[test] fn should_init_ordered_data_columns_and_return_sampling_columns() { let spec = E::default_spec(); - let custody_context = CustodyContext::::new(false); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let sampling_size = custody_context.num_of_data_columns_to_sample(Epoch::new(0), &spec); // initialise ordered columns @@ -712,8 +829,8 @@ mod tests { #[test] fn custody_columns_for_epoch_no_validators_fullnode() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::>(); custody_context @@ -728,8 +845,8 @@ mod tests { #[test] fn custody_columns_for_epoch_no_validators_supernode() { - let custody_context = CustodyContext::::new(true); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Supernode, &spec); let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::>(); custody_context @@ -744,8 +861,8 @@ mod tests { #[test] fn custody_columns_for_epoch_with_validators_should_match_cgc() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::>(); let val_custody_units = 10; @@ -770,8 +887,8 @@ mod tests { #[test] fn custody_columns_for_epoch_specific_epoch_uses_epoch_cgc() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::>(); let test_epoch = Epoch::new(5); @@ -787,4 +904,133 @@ mod tests { expected_cgc as 
usize ); } + + #[test] + fn restore_from_persisted_fullnode_no_validators() { + let spec = E::default_spec(); + let ssz_context = CustodyContextSsz { + validator_custody_at_head: 0, // no validators + persisted_is_supernode: false, + epoch_validator_custody_requirements: vec![], + }; + + let custody_context = CustodyContext::::new_from_persisted_custody_context( + ssz_context, + NodeCustodyType::Fullnode, + &spec, + ); + + assert_eq!( + custody_context.custody_group_count_at_head(&spec), + spec.custody_requirement, + "restored custody group count should match fullnode default" + ); + } + + #[test] + fn restore_fullnode_then_switch_to_supernode_has_no_effect() { + let spec = E::default_spec(); + let ssz_context = CustodyContextSsz { + validator_custody_at_head: 0, // no validators + persisted_is_supernode: false, + epoch_validator_custody_requirements: vec![], + }; + + // Attempt to restore as supernode (wants 128), but should use original persisted value + let custody_context = CustodyContext::::new_from_persisted_custody_context( + ssz_context, + NodeCustodyType::Supernode, + &spec, + ); + + assert_eq!( + custody_context.custody_group_count_at_head(&spec), + spec.custody_requirement, + "should use original fullnode cgc, not supernode cgc" + ); + } + + #[test] + fn restore_supernode_then_switch_to_fullnode_uses_persisted() { + let spec = E::default_spec(); + let supernode_cgc = spec.number_of_custody_groups; // supernode cgc + + let ssz_context = CustodyContextSsz { + validator_custody_at_head: supernode_cgc, + persisted_is_supernode: false, + epoch_validator_custody_requirements: vec![(Epoch::new(0), supernode_cgc)], + }; + + // Attempt to restore as fullnode (wants 8), but should keep persisted value (128) + let custody_context = CustodyContext::::new_from_persisted_custody_context( + ssz_context, + NodeCustodyType::Fullnode, + &spec, + ); + + assert_eq!( + custody_context.custody_group_count_at_head(&spec), + supernode_cgc, + "should use persisted supernode cgc, 
not fullnode cgc" + ); + } + + #[test] + fn restore_with_validator_custody_history_across_epochs() { + let spec = E::default_spec(); + let initial_cgc = 8u64; + let increased_cgc = 16u64; + let final_cgc = 32u64; + + let ssz_context = CustodyContextSsz { + validator_custody_at_head: final_cgc, + persisted_is_supernode: false, + epoch_validator_custody_requirements: vec![ + (Epoch::new(0), initial_cgc), + (Epoch::new(10), increased_cgc), + (Epoch::new(20), final_cgc), + ], + }; + + let custody_context = CustodyContext::::new_from_persisted_custody_context( + ssz_context, + NodeCustodyType::Fullnode, + &spec, + ); + + // Verify head uses latest value + assert_eq!( + custody_context.custody_group_count_at_head(&spec), + final_cgc + ); + + // Verify historical epoch lookups work correctly + assert_eq!( + custody_context.custody_group_count_at_epoch(Epoch::new(5), &spec), + initial_cgc, + "epoch 5 should use initial cgc" + ); + assert_eq!( + custody_context.custody_group_count_at_epoch(Epoch::new(15), &spec), + increased_cgc, + "epoch 15 should use increased cgc" + ); + assert_eq!( + custody_context.custody_group_count_at_epoch(Epoch::new(25), &spec), + final_cgc, + "epoch 25 should use final cgc" + ); + + // Verify sampling size calculation uses correct historical values + assert_eq!( + custody_context.num_of_custody_groups_to_sample(Epoch::new(5), &spec), + spec.samples_per_slot, + "sampling at epoch 5 should use spec minimum since cgc is at minimum" + ); + assert_eq!( + custody_context.num_of_custody_groups_to_sample(Epoch::new(25), &spec), + final_cgc, + "sampling at epoch 25 should match final cgc" + ); + } } diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 43b7d8f7ea..d6cc8d8947 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -644,8 +644,17 @@ impl DataAvailabilityChecker { "Reconstructed 
columns" ); + let columns_to_sample = self + .custody_context() + .sampling_columns_for_epoch(slot.epoch(T::EthSpec::slots_per_epoch()), &self.spec); + let data_columns_to_import: Vec<_> = data_columns_to_publish + .iter() + .filter(|column| columns_to_sample.contains(&column.index())) + .cloned() + .collect(); + self.availability_cache - .put_kzg_verified_data_columns(*block_root, data_columns_to_publish.clone()) + .put_kzg_verified_data_columns(*block_root, data_columns_to_import) .map(|availability| { DataColumnReconstructionResult::Success(( availability, @@ -859,6 +868,7 @@ impl MaybeAvailableBlock { mod test { use super::*; use crate::CustodyContext; + use crate::custody_context::NodeCustodyType; use crate::test_utils::{ EphemeralHarnessType, NumBlobs, generate_rand_block_and_data_columns, get_kzg, }; @@ -1082,6 +1092,95 @@ mod test { verification_result.expect_err("should have failed to verify blocks"); } + #[test] + fn should_exclude_reconstructed_columns_not_required_for_sampling() { + // SETUP + let spec = Arc::new(ForkName::Fulu.make_genesis_spec(E::default_spec())); + let mut rng = StdRng::seed_from_u64(0xDEADBEEF0BAD5EEDu64); + + let da_checker = new_da_checker(spec.clone()); + let custody_context = &da_checker.custody_context; + let all_column_indices_ordered = + init_custody_context_with_ordered_columns(custody_context, &mut rng, &spec); + + // Set custody requirement to 65 columns (enough to trigger reconstruction) + let epoch = Epoch::new(1); + custody_context.register_validators( + vec![(0, 2_048_000_000_000), (1, 32_000_000_000)], // 64 + 1 + Slot::new(0), + &spec, + ); + let sampling_requirement = custody_context.num_of_data_columns_to_sample(epoch, &spec); + assert_eq!( + sampling_requirement, 65, + "sampling requirement should be 65" + ); + + let (block, data_columns) = generate_rand_block_and_data_columns::( + ForkName::Fulu, + NumBlobs::Number(1), + &mut rng, + &spec, + ); + let block_root = Hash256::random(); + // Add the block to the DA 
checker + da_checker + .availability_cache + .put_pre_execution_block(block_root, Arc::new(block), BlockImportSource::Gossip) + .expect("should put block"); + + // Add 64 columns to the da checker (enough to be able to reconstruct) + // Order by all_column_indices_ordered, then take first 64 + let custody_columns = all_column_indices_ordered + .iter() + .filter_map(|&col_idx| data_columns.iter().find(|d| d.index == col_idx).cloned()) + .take(64) + .map(|d| { + KzgVerifiedCustodyDataColumn::from_asserted_custody( + KzgVerifiedDataColumn::__new_for_testing(d), + ) + }) + .collect::>(); + + da_checker + .availability_cache + .put_kzg_verified_data_columns(block_root, custody_columns) + .expect("should put custody columns"); + + // Try reconstrucing + let reconstruction_result = da_checker + .reconstruct_data_columns(&block_root) + .expect("should reconstruct columns"); + + // Reconstruction should succeed + let (_availability, reconstructed_columns) = match reconstruction_result { + DataColumnReconstructionResult::Success(result) => result, + e => { + panic!("Expected successful reconstruction {:?}", e); + } + }; + + // Remaining 64 columns should be reconstructed + assert_eq!( + reconstructed_columns.len(), + 64, + "should reconstruct the remaining 64 columns" + ); + + // Only the columns required for custody (65) should be imported into the cache + let sampling_columns = custody_context.sampling_columns_for_epoch(epoch, &spec); + let actual_cached: HashSet = da_checker + .cached_data_column_indexes(&block_root) + .expect("should have cached data columns") + .into_iter() + .collect(); + let expected_sampling_columns = sampling_columns.iter().copied().collect::>(); + assert_eq!( + actual_cached, expected_sampling_columns, + "should cache only the required custody columns, not all reconstructed columns" + ); + } + fn init_custody_context_with_ordered_columns( custody_context: &Arc>, mut rng: &mut StdRng, @@ -1103,7 +1202,7 @@ mod test { ); let kzg = get_kzg(&spec); let 
store = Arc::new(HotColdDB::open_ephemeral(<_>::default(), spec.clone()).unwrap()); - let custody_context = Arc::new(CustodyContext::new(false)); + let custody_context = Arc::new(CustodyContext::new(NodeCustodyType::Fullnode, &spec)); let complete_blob_backfill = false; DataAvailabilityChecker::new( complete_blob_backfill, diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index 42f6dbd856..b842a1a3f9 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -827,6 +827,7 @@ mod test { blob_verification::GossipVerifiedBlob, block_verification::PayloadVerificationOutcome, block_verification_types::{AsBlock, BlockImportData}, + custody_context::NodeCustodyType, data_availability_checker::STATE_LRU_CAPACITY, test_utils::{BaseHarnessType, BeaconChainHarness, DiskHarnessType}, }; @@ -1021,7 +1022,7 @@ mod test { let spec = harness.spec.clone(); let test_store = harness.chain.store.clone(); let capacity_non_zero = new_non_zero_usize(capacity); - let custody_context = Arc::new(CustodyContext::new(false)); + let custody_context = Arc::new(CustodyContext::new(NodeCustodyType::Fullnode, &spec)); let cache = Arc::new( DataAvailabilityCheckerInner::::new( capacity_non_zero, diff --git a/beacon_node/beacon_chain/src/data_column_verification.rs b/beacon_node/beacon_chain/src/data_column_verification.rs index 01e79c49aa..07f85b045a 100644 --- a/beacon_node/beacon_chain/src/data_column_verification.rs +++ b/beacon_node/beacon_chain/src/data_column_verification.rs @@ -626,21 +626,22 @@ fn verify_parent_block_and_finalized_descendant( chain: &BeaconChain, ) -> Result { let fork_choice = chain.canonical_head.fork_choice_read_lock(); - let block_parent_root = data_column.block_parent_root(); - - // Do not process a column that does not descend from the 
finalized root. - if !fork_choice.is_finalized_checkpoint_or_descendant(block_parent_root) { - return Err(GossipDataColumnError::NotFinalizedDescendant { block_parent_root }); - } // We have already verified that the column is past finalization, so we can // just check fork choice for the block's parent. + let block_parent_root = data_column.block_parent_root(); let Some(parent_block) = fork_choice.get_block(&block_parent_root) else { return Err(GossipDataColumnError::ParentUnknown { parent_root: block_parent_root, }); }; + // Do not process a column that does not descend from the finalized root. + // We just loaded the parent_block, so we can be sure that it exists in fork choice. + if !fork_choice.is_finalized_checkpoint_or_descendant(block_parent_root) { + return Err(GossipDataColumnError::NotFinalizedDescendant { block_parent_root }); + } + Ok(parent_block) } diff --git a/beacon_node/beacon_chain/src/errors.rs b/beacon_node/beacon_chain/src/errors.rs index 7b04a36fae..d4eba2b0ea 100644 --- a/beacon_node/beacon_chain/src/errors.rs +++ b/beacon_node/beacon_chain/src/errors.rs @@ -247,6 +247,7 @@ pub enum BeaconChainError { cache_epoch: Epoch, }, SkipProposerPreparation, + FailedColumnCustodyInfoUpdate, } easy_from_to!(SlotProcessingError, BeaconChainError); diff --git a/beacon_node/beacon_chain/src/historical_data_columns.rs b/beacon_node/beacon_chain/src/historical_data_columns.rs new file mode 100644 index 0000000000..7e196eb75e --- /dev/null +++ b/beacon_node/beacon_chain/src/historical_data_columns.rs @@ -0,0 +1,151 @@ +use std::collections::{HashMap, HashSet}; + +use crate::{ + BeaconChain, BeaconChainError, BeaconChainTypes, + data_column_verification::verify_kzg_for_data_column_list, +}; +use store::{Error as StoreError, KeyValueStore}; +use tracing::{Span, debug, instrument}; +use types::{ColumnIndex, DataColumnSidecarList, Epoch, EthSpec, Hash256, Slot}; + +#[derive(Debug)] +pub enum HistoricalDataColumnError { + // The provided data column sidecar 
pertains to a block that doesn't exist in the database. + NoBlockFound { + data_column_block_root: Hash256, + expected_block_root: Hash256, + }, + + /// Logic error: should never occur. + IndexOutOfBounds, + + /// The provided data column sidecar list doesn't contain columns for the full range of slots for the given epoch. + MissingDataColumns { + missing_slots_and_data_columns: Vec<(Slot, ColumnIndex)>, + }, + + /// The provided data column sidecar list contains at least one column with an invalid kzg commitment. + InvalidKzg, + + /// Internal store error + StoreError(StoreError), + + /// Internal beacon chain error + BeaconChainError(Box), +} + +impl From for HistoricalDataColumnError { + fn from(e: StoreError) -> Self { + Self::StoreError(e) + } +} + +impl BeaconChain { + /// Store a batch of historical data columns in the database. + /// + /// The data columns block roots and proposer signatures are verified with the existing + /// block stored in the DB. This function also verifies the columns KZG committments. + /// + /// This function requires that the data column sidecar list contains columns for a full epoch. + /// + /// Return the number of `data_columns` successfully imported. 
+ #[instrument(skip_all, fields(columns_imported_count = tracing::field::Empty ))] + pub fn import_historical_data_column_batch( + &self, + epoch: Epoch, + historical_data_column_sidecar_list: DataColumnSidecarList, + ) -> Result { + let mut total_imported = 0; + let mut ops = vec![]; + + let unique_column_indices = historical_data_column_sidecar_list + .iter() + .map(|item| item.index) + .collect::>(); + + let mut slot_and_column_index_to_data_columns = historical_data_column_sidecar_list + .iter() + .map(|data_column| ((data_column.slot(), data_column.index), data_column)) + .collect::>(); + + let forward_blocks_iter = self + .forwards_iter_block_roots_until( + epoch.start_slot(T::EthSpec::slots_per_epoch()), + epoch.end_slot(T::EthSpec::slots_per_epoch()), + ) + .map_err(|e| HistoricalDataColumnError::BeaconChainError(Box::new(e)))?; + + for block_iter_result in forward_blocks_iter { + let (block_root, slot) = block_iter_result + .map_err(|e| HistoricalDataColumnError::BeaconChainError(Box::new(e)))?; + + for column_index in unique_column_indices.clone() { + if let Some(data_column) = + slot_and_column_index_to_data_columns.remove(&(slot, column_index)) + { + if self + .store + .get_data_column(&block_root, &data_column.index)? + .is_some() + { + debug!( + block_root = ?block_root, + column_index = data_column.index, + "Skipping data column import as identical data column exists" + ); + continue; + } + if block_root != data_column.block_root() { + return Err(HistoricalDataColumnError::NoBlockFound { + data_column_block_root: data_column.block_root(), + expected_block_root: block_root, + }); + } + self.store.data_column_as_kv_store_ops( + &block_root, + data_column.clone(), + &mut ops, + ); + total_imported += 1; + } + } + } + + // If we've made it to here with no columns to import, this means there are no blobs for this epoch. 
+ // `RangeDataColumnBatchRequest` logic should have caught any bad peers withholding columns + if historical_data_column_sidecar_list.is_empty() { + if !ops.is_empty() { + // This shouldn't be a valid case. If there are no columns to import, + // there should be no generated db operations. + return Err(HistoricalDataColumnError::IndexOutOfBounds); + } + } else { + verify_kzg_for_data_column_list(historical_data_column_sidecar_list.iter(), &self.kzg) + .map_err(|_| HistoricalDataColumnError::InvalidKzg)?; + + self.store.blobs_db.do_atomically(ops)?; + } + + if !slot_and_column_index_to_data_columns.is_empty() { + debug!( + ?epoch, + extra_data = ?slot_and_column_index_to_data_columns.keys().map(|(slot, _)| slot), + "We've received unexpected extra data columns, these will not be imported" + ); + } + + self.data_availability_checker + .custody_context() + .update_and_backfill_custody_count_at_epoch(epoch); + + self.safely_backfill_data_column_custody_info(epoch) + .map_err(|e| HistoricalDataColumnError::BeaconChainError(Box::new(e)))?; + + debug!(?epoch, total_imported, "Imported historical data columns"); + + let current_span = Span::current(); + current_span.record("columns_imported_count", total_imported); + + Ok(total_imported) + } +} diff --git a/beacon_node/beacon_chain/src/lib.rs b/beacon_node/beacon_chain/src/lib.rs index 9d8c3dba38..4ac3e54742 100644 --- a/beacon_node/beacon_chain/src/lib.rs +++ b/beacon_node/beacon_chain/src/lib.rs @@ -17,6 +17,7 @@ pub mod block_verification_types; pub mod builder; pub mod canonical_head; pub mod chain_config; +pub mod custody_context; pub mod data_availability_checker; pub mod data_column_verification; mod early_attester_cache; @@ -28,6 +29,7 @@ pub mod fork_choice_signal; pub mod fork_revert; pub mod graffiti_calculator; pub mod historical_blocks; +pub mod historical_data_columns; pub mod kzg_utils; pub mod light_client_finality_update_verification; pub mod light_client_optimistic_update_verification; @@ -54,7 +56,6 @@ 
pub mod summaries_dag; pub mod sync_committee_rewards; pub mod sync_committee_verification; pub mod test_utils; -pub mod validator_custody; pub mod validator_monitor; pub mod validator_pubkey_cache; @@ -83,6 +84,7 @@ pub use block_verification::{ pub use block_verification_types::AvailabilityPendingExecutedBlock; pub use block_verification_types::ExecutedBlock; pub use canonical_head::{CachedHead, CanonicalHead, CanonicalHeadRwLock}; +pub use custody_context::CustodyContext; pub use events::ServerSentEventHandler; pub use execution_layer::EngineState; pub use execution_payload::NotifyExecutionLayer; @@ -98,4 +100,3 @@ pub use state_processing::per_block_processing::errors::{ }; pub use store; pub use types; -pub use validator_custody::CustodyContext; diff --git a/beacon_node/beacon_chain/src/persisted_custody.rs b/beacon_node/beacon_chain/src/persisted_custody.rs index b685ea36b7..ba221c67b5 100644 --- a/beacon_node/beacon_chain/src/persisted_custody.rs +++ b/beacon_node/beacon_chain/src/persisted_custody.rs @@ -1,4 +1,4 @@ -use crate::validator_custody::CustodyContextSsz; +use crate::custody_context::CustodyContextSsz; use ssz::{Decode, Encode}; use std::sync::Arc; use store::{DBColumn, Error as StoreError, HotColdDB, ItemStore, StoreItem}; diff --git a/beacon_node/beacon_chain/src/schema_change/migration_schema_v26.rs b/beacon_node/beacon_chain/src/schema_change/migration_schema_v26.rs index 661d015942..38714ea060 100644 --- a/beacon_node/beacon_chain/src/schema_change/migration_schema_v26.rs +++ b/beacon_node/beacon_chain/src/schema_change/migration_schema_v26.rs @@ -1,6 +1,6 @@ use crate::BeaconChainTypes; +use crate::custody_context::CustodyContextSsz; use crate::persisted_custody::{CUSTODY_DB_KEY, PersistedCustody}; -use crate::validator_custody::CustodyContextSsz; use ssz::{Decode, Encode}; use ssz_derive::{Decode, Encode}; use std::sync::Arc; diff --git a/beacon_node/beacon_chain/src/state_advance_timer.rs 
b/beacon_node/beacon_chain/src/state_advance_timer.rs index 87348cb01b..b10edf2336 100644 --- a/beacon_node/beacon_chain/src/state_advance_timer.rs +++ b/beacon_node/beacon_chain/src/state_advance_timer.rs @@ -333,25 +333,54 @@ fn advance_head(beacon_chain: &Arc>) -> Resu .build_committee_cache(RelativeEpoch::Next, &beacon_chain.spec) .map_err(BeaconChainError::from)?; - // If the `pre_state` is in a later epoch than `state`, pre-emptively add the proposer shuffling - // for the state's current epoch and the committee cache for the state's next epoch. + // The state root is required to prime the proposer cache AND for writing it to disk. + let advanced_state_root = state.update_tree_hash_cache()?; + + // If the `pre_state` is in a later epoch than `state`, pre-emptively update the proposer + // shuffling and attester shuffling caches. if initial_epoch < state.current_epoch() { - // Update the proposer cache. - // - // We supply the `head_block_root` as the decision block since the prior `if` statement guarantees - // the head root is the latest block from the prior epoch. - beacon_chain - .beacon_proposer_cache - .lock() - .insert( - state.current_epoch(), - head_block_root, - state - .get_beacon_proposer_indices(state.current_epoch(), &beacon_chain.spec) - .map_err(BeaconChainError::from)?, - state.fork(), - ) - .map_err(BeaconChainError::from)?; + // Include the proposer shuffling from the current epoch, which is likely to be useful + // pre-Fulu, and probably redundant post-Fulu (it should already have been in the cache). 
+ let current_epoch_decision_root = state.proposer_shuffling_decision_root_at_epoch( + state.current_epoch(), + head_block_root, + &beacon_chain.spec, + )?; + beacon_chain.with_proposer_cache( + current_epoch_decision_root, + state.current_epoch(), + |_| Ok(()), + || { + debug!( + shuffling_decision_root = ?current_epoch_decision_root, + epoch = %state.current_epoch(), + "Computing current epoch proposer shuffling in state advance" + ); + Ok::<_, Error>((advanced_state_root, state.clone())) + }, + )?; + + // For epochs *greater than* the Fulu fork epoch, we have also determined the proposer + // shuffling for the next epoch. + let next_epoch = state.next_epoch()?; + let next_epoch_decision_root = state.proposer_shuffling_decision_root_at_epoch( + next_epoch, + head_block_root, + &beacon_chain.spec, + )?; + beacon_chain.with_proposer_cache( + next_epoch_decision_root, + next_epoch, + |_| Ok(()), + || { + debug!( + shuffling_decision_root = ?next_epoch_decision_root, + epoch = %next_epoch, + "Computing next epoch proposer shuffling in state advance" + ); + Ok::<_, Error>((advanced_state_root, state.clone())) + }, + )?; // Update the attester cache. let shuffling_id = @@ -406,7 +435,6 @@ fn advance_head(beacon_chain: &Arc>) -> Resu // even if we race with the deletion of this state by the finalization pruning code, the worst // case is we end up with a finalized state stored, that will get pruned the next time pruning // runs. 
- let advanced_state_root = state.update_tree_hash_cache()?; beacon_chain.store.put_state(&advanced_state_root, &state)?; debug!( diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index bfd0484d91..623cc01d46 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -1,5 +1,6 @@ use crate::blob_verification::GossipVerifiedBlob; use crate::block_verification_types::{AsBlock, RpcBlock}; +use crate::custody_context::NodeCustodyType; use crate::data_column_verification::CustodyDataColumn; use crate::kzg_utils::build_data_column_sidecars; use crate::observed_operations::ObservationOutcome; @@ -210,7 +211,7 @@ pub struct Builder { testing_slot_clock: Option, validator_monitor_config: Option, genesis_state_builder: Option>, - import_all_data_columns: bool, + node_custody_type: NodeCustodyType, runtime: TestRuntime, } @@ -356,7 +357,7 @@ where testing_slot_clock: None, validator_monitor_config: None, genesis_state_builder: None, - import_all_data_columns: false, + node_custody_type: NodeCustodyType::Fullnode, runtime, } } @@ -442,8 +443,8 @@ where self } - pub fn import_all_data_columns(mut self, import_all_data_columns: bool) -> Self { - self.import_all_data_columns = import_all_data_columns; + pub fn node_custody_type(mut self, node_custody_type: NodeCustodyType) -> Self { + self.node_custody_type = node_custody_type; self } @@ -565,7 +566,7 @@ where .execution_layer(self.execution_layer) .shutdown_sender(shutdown_tx) .chain_config(chain_config) - .import_all_data_columns(self.import_all_data_columns) + .node_custody_type(self.node_custody_type) .event_handler(Some(ServerSentEventHandler::new_with_capacity(5))) .validator_monitor_config(validator_monitor_config) .rng(Box::new(StdRng::seed_from_u64(42))); @@ -2437,7 +2438,7 @@ where } /// Builds an `RpcBlock` from a `SignedBeaconBlock` and `BlobsList`. 
- fn build_rpc_block_from_blobs( + pub fn build_rpc_block_from_blobs( &self, block_root: Hash256, block: Arc>>, diff --git a/beacon_node/beacon_chain/tests/blob_verification.rs b/beacon_node/beacon_chain/tests/blob_verification.rs new file mode 100644 index 0000000000..c42a2828c0 --- /dev/null +++ b/beacon_node/beacon_chain/tests/blob_verification.rs @@ -0,0 +1,120 @@ +#![cfg(not(debug_assertions))] + +use beacon_chain::test_utils::{ + AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType, test_spec, +}; +use beacon_chain::{ + AvailabilityProcessingStatus, BlockError, ChainConfig, InvalidSignature, NotifyExecutionLayer, + block_verification_types::AsBlock, +}; +use logging::create_test_tracing_subscriber; +use std::sync::{Arc, LazyLock}; +use types::{blob_sidecar::FixedBlobSidecarList, *}; + +type E = MainnetEthSpec; + +// Should ideally be divisible by 3. +const VALIDATOR_COUNT: usize = 24; + +/// A cached set of keys. +static KEYPAIRS: LazyLock> = + LazyLock::new(|| types::test_utils::generate_deterministic_keypairs(VALIDATOR_COUNT)); + +fn get_harness( + validator_count: usize, + spec: Arc, +) -> BeaconChainHarness> { + create_test_tracing_subscriber(); + let harness = BeaconChainHarness::builder(MainnetEthSpec) + .spec(spec) + .chain_config(ChainConfig { + reconstruct_historic_states: true, + ..ChainConfig::default() + }) + .keypairs(KEYPAIRS[0..validator_count].to_vec()) + .fresh_ephemeral_store() + .mock_execution_layer() + .build(); + + harness.advance_slot(); + + harness +} + +// Regression test for https://github.com/sigp/lighthouse/issues/7650 +#[tokio::test] +async fn rpc_blobs_with_invalid_header_signature() { + let spec = Arc::new(test_spec::()); + + // Only run this test if blobs are enabled and columns are disabled. + if spec.deneb_fork_epoch.is_none() || spec.is_fulu_scheduled() { + return; + } + + let harness = get_harness(VALIDATOR_COUNT, spec); + + let num_blocks = E::slots_per_epoch() as usize; + + // Add some chain depth. 
+ harness + .extend_chain( + num_blocks, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Produce a block with blobs. + harness.execution_block_generator().set_min_blob_count(1); + let head_state = harness.get_current_state(); + let slot = head_state.slot() + 1; + let ((signed_block, opt_blobs), _) = harness.make_block(head_state, slot).await; + let (kzg_proofs, blobs) = opt_blobs.unwrap(); + assert!(!blobs.is_empty()); + let block_root = signed_block.canonical_root(); + + // Process the block without blobs so that it doesn't become available. + harness.advance_slot(); + let rpc_block = harness + .build_rpc_block_from_blobs(block_root, signed_block.clone(), None) + .unwrap(); + let availability = harness + .chain + .process_block( + block_root, + rpc_block, + NotifyExecutionLayer::Yes, + BlockImportSource::RangeSync, + || Ok(()), + ) + .await + .unwrap(); + assert_eq!( + availability, + AvailabilityProcessingStatus::MissingComponents(slot, block_root) + ); + + // Build blob sidecars with invalid signatures in the block header. 
+ let mut corrupt_block = (*signed_block).clone(); + *corrupt_block.signature_mut() = Signature::infinity().unwrap(); + + let max_len = harness + .chain + .spec + .max_blobs_per_block(slot.epoch(E::slots_per_epoch())) as usize; + let mut blob_sidecars = FixedBlobSidecarList::new(vec![None; max_len]); + for (i, (kzg_proof, blob)) in kzg_proofs.into_iter().zip(blobs).enumerate() { + let blob_sidecar = BlobSidecar::new(i, blob, &corrupt_block, kzg_proof).unwrap(); + blob_sidecars[i] = Some(Arc::new(blob_sidecar)); + } + + let err = harness + .chain + .process_rpc_blobs(slot, block_root, blob_sidecars) + .await + .unwrap_err(); + assert!(matches!( + err, + BlockError::InvalidSignature(InvalidSignature::ProposerSignature) + )); +} diff --git a/beacon_node/beacon_chain/tests/block_verification.rs b/beacon_node/beacon_chain/tests/block_verification.rs index 47f5be02cb..7dfef50ea1 100644 --- a/beacon_node/beacon_chain/tests/block_verification.rs +++ b/beacon_node/beacon_chain/tests/block_verification.rs @@ -4,6 +4,7 @@ use beacon_chain::block_verification_types::{AsBlock, ExecutedBlock, RpcBlock}; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, ExecutionPendingBlock, + custody_context::NodeCustodyType, test_utils::{ AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType, test_spec, }, @@ -45,7 +46,7 @@ async fn get_chain_segment() -> (Vec>, Vec (Vec>, Vec BeaconChainHarness> { let harness = BeaconChainHarness::builder(MainnetEthSpec) .default_spec() @@ -115,7 +116,7 @@ fn get_harness( ..ChainConfig::default() }) .keypairs(KEYPAIRS[0..validator_count].to_vec()) - .import_all_data_columns(supernode) + .node_custody_type(node_custody_type) .fresh_ephemeral_store() .mock_execution_layer() .build(); @@ -259,7 +260,7 @@ fn update_data_column_signed_header( #[tokio::test] async fn chain_segment_full_segment() { - let harness = get_harness(VALIDATOR_COUNT, false); 
+ let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let blocks: Vec> = chain_segment_blocks(&chain_segment, &chain_segment_blobs) .into_iter() @@ -297,7 +298,7 @@ async fn chain_segment_full_segment() { #[tokio::test] async fn chain_segment_varying_chunk_size() { for chunk_size in &[1, 2, 3, 5, 31, 32, 33, 42] { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let blocks: Vec> = chain_segment_blocks(&chain_segment, &chain_segment_blobs) .into_iter() @@ -329,7 +330,7 @@ async fn chain_segment_varying_chunk_size() { #[tokio::test] async fn chain_segment_non_linear_parent_roots() { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; harness @@ -386,7 +387,7 @@ async fn chain_segment_non_linear_parent_roots() { #[tokio::test] async fn chain_segment_non_linear_slots() { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; harness .chain @@ -528,7 +529,7 @@ async fn assert_invalid_signature( async fn get_invalid_sigs_harness( chain_segment: &[BeaconSnapshot], ) -> BeaconChainHarness> { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); harness .chain .slot_clock @@ -986,7 +987,7 @@ fn unwrap_err(result: Result) -> U { #[tokio::test] async fn block_gossip_verification() { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let block_index = 
CHAIN_SEGMENT_LENGTH - 2; @@ -1389,7 +1390,7 @@ async fn verify_block_for_gossip_slashing_detection() { #[tokio::test] async fn verify_block_for_gossip_doppelganger_detection() { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let state = harness.get_current_state(); let ((block, _), _) = harness.make_block(state.clone(), Slot::new(1)).await; diff --git a/beacon_node/beacon_chain/tests/column_verification.rs b/beacon_node/beacon_chain/tests/column_verification.rs new file mode 100644 index 0000000000..229ae1e199 --- /dev/null +++ b/beacon_node/beacon_chain/tests/column_verification.rs @@ -0,0 +1,117 @@ +#![cfg(not(debug_assertions))] + +use beacon_chain::custody_context::NodeCustodyType; +use beacon_chain::test_utils::{ + AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType, + generate_data_column_sidecars_from_block, test_spec, +}; +use beacon_chain::{ + AvailabilityProcessingStatus, BlockError, ChainConfig, InvalidSignature, NotifyExecutionLayer, + block_verification_types::AsBlock, +}; +use logging::create_test_tracing_subscriber; +use std::sync::{Arc, LazyLock}; +use types::*; + +type E = MainnetEthSpec; + +// Should ideally be divisible by 3. +const VALIDATOR_COUNT: usize = 24; + +/// A cached set of keys. 
+static KEYPAIRS: LazyLock> = + LazyLock::new(|| types::test_utils::generate_deterministic_keypairs(VALIDATOR_COUNT)); + +fn get_harness( + validator_count: usize, + spec: Arc, + node_custody_type: NodeCustodyType, +) -> BeaconChainHarness> { + create_test_tracing_subscriber(); + let harness = BeaconChainHarness::builder(MainnetEthSpec) + .spec(spec) + .chain_config(ChainConfig { + reconstruct_historic_states: true, + ..ChainConfig::default() + }) + .keypairs(KEYPAIRS[0..validator_count].to_vec()) + .node_custody_type(node_custody_type) + .fresh_ephemeral_store() + .mock_execution_layer() + .build(); + + harness.advance_slot(); + + harness +} + +// Regression test for https://github.com/sigp/lighthouse/issues/7650 +#[tokio::test] +async fn rpc_columns_with_invalid_header_signature() { + let spec = Arc::new(test_spec::()); + + // Only run this test if columns are enabled. + if !spec.is_fulu_scheduled() { + return; + } + + let harness = get_harness(VALIDATOR_COUNT, spec, NodeCustodyType::Supernode); + + let num_blocks = E::slots_per_epoch() as usize; + + // Add some chain depth. + harness + .extend_chain( + num_blocks, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Produce a block with blobs. + harness.execution_block_generator().set_min_blob_count(1); + let head_state = harness.get_current_state(); + let slot = head_state.slot() + 1; + let ((signed_block, opt_blobs), _) = harness.make_block(head_state, slot).await; + let (_, blobs) = opt_blobs.unwrap(); + assert!(!blobs.is_empty()); + let block_root = signed_block.canonical_root(); + + // Process the block without blobs so that it doesn't become available. 
+ harness.advance_slot(); + let rpc_block = harness + .build_rpc_block_from_blobs(block_root, signed_block.clone(), None) + .unwrap(); + let availability = harness + .chain + .process_block( + block_root, + rpc_block, + NotifyExecutionLayer::Yes, + BlockImportSource::RangeSync, + || Ok(()), + ) + .await + .unwrap(); + assert_eq!( + availability, + AvailabilityProcessingStatus::MissingComponents(slot, block_root) + ); + + // Build blob sidecars with invalid signatures in the block header. + let mut corrupt_block = (*signed_block).clone(); + *corrupt_block.signature_mut() = Signature::infinity().unwrap(); + + let data_column_sidecars = + generate_data_column_sidecars_from_block(&corrupt_block, &harness.chain.spec); + + let err = harness + .chain + .process_rpc_custody_columns(data_column_sidecars) + .await + .unwrap_err(); + assert!(matches!( + err, + BlockError::InvalidSignature(InvalidSignature::ProposerSignature) + )); +} diff --git a/beacon_node/beacon_chain/tests/events.rs b/beacon_node/beacon_chain/tests/events.rs index 0fc097ae8f..466058eea3 100644 --- a/beacon_node/beacon_chain/tests/events.rs +++ b/beacon_node/beacon_chain/tests/events.rs @@ -1,15 +1,13 @@ use beacon_chain::blob_verification::GossipVerifiedBlob; use beacon_chain::data_column_verification::GossipVerifiedDataColumn; -use beacon_chain::test_utils::{BeaconChainHarness, TEST_DATA_COLUMN_SIDECARS_SSZ}; +use beacon_chain::test_utils::{BeaconChainHarness, generate_data_column_sidecars_from_block}; use eth2::types::{EventKind, SseBlobSidecar, SseDataColumnSidecar}; use rand::SeedableRng; use rand::rngs::StdRng; use std::sync::Arc; use types::blob_sidecar::FixedBlobSidecarList; use types::test_utils::TestRandom; -use types::{ - BlobSidecar, DataColumnSidecar, EthSpec, ForkName, MinimalEthSpec, RuntimeVariableList, Slot, -}; +use types::{BlobSidecar, DataColumnSidecar, EthSpec, ForkName, MinimalEthSpec, Slot}; type E = MinimalEthSpec; @@ -108,19 +106,18 @@ async fn 
blob_sidecar_event_on_process_rpc_blobs() { let mut blob_event_receiver = event_handler.subscribe_blob_sidecar(); // build and process multiple rpc blobs - let kzg = harness.chain.kzg.as_ref(); - let mut rng = StdRng::seed_from_u64(0xDEADBEEF0BAD5EEDu64); + harness.execution_block_generator().set_min_blob_count(2); - let mut blob_1 = BlobSidecar::random_valid(&mut rng, kzg).unwrap(); - let mut blob_2 = BlobSidecar { - index: 1, - ..BlobSidecar::random_valid(&mut rng, kzg).unwrap() - }; - let parent_root = harness.chain.head().head_block_root(); - blob_1.signed_block_header.message.parent_root = parent_root; - blob_2.signed_block_header.message.parent_root = parent_root; - let blob_1 = Arc::new(blob_1); - let blob_2 = Arc::new(blob_2); + let head_state = harness.get_current_state(); + let slot = head_state.slot() + 1; + let ((signed_block, opt_blobs), _) = harness.make_block(head_state, slot).await; + let (kzg_proofs, blobs) = opt_blobs.unwrap(); + assert!(blobs.len() > 2); + + let blob_1 = + Arc::new(BlobSidecar::new(0, blobs[0].clone(), &signed_block, kzg_proofs[0]).unwrap()); + let blob_2 = + Arc::new(BlobSidecar::new(1, blobs[1].clone(), &signed_block, kzg_proofs[1]).unwrap()); let blobs = FixedBlobSidecarList::new(vec![Some(blob_1.clone()), Some(blob_2.clone())]); let expected_sse_blobs = vec![ @@ -130,7 +127,7 @@ async fn blob_sidecar_event_on_process_rpc_blobs() { let _ = harness .chain - .process_rpc_blobs(blob_1.slot(), blob_1.block_root(), blobs) + .process_rpc_blobs(slot, blob_1.block_root(), blobs) .await .unwrap(); @@ -159,20 +156,24 @@ async fn data_column_sidecar_event_on_process_rpc_columns() { let event_handler = harness.chain.event_handler.as_ref().unwrap(); let mut data_column_event_receiver = event_handler.subscribe_data_column_sidecar(); + // build a valid block + harness.execution_block_generator().set_min_blob_count(1); + + let head_state = harness.get_current_state(); + let slot = head_state.slot() + 1; + let ((signed_block, opt_blobs), _) = 
harness.make_block(head_state, slot).await; + let (_, blobs) = opt_blobs.unwrap(); + assert!(!blobs.is_empty()); + // load the precomputed column sidecar to avoid computing them for every block in the tests. - let mut sidecar = RuntimeVariableList::>::from_ssz_bytes( - TEST_DATA_COLUMN_SIDECARS_SSZ, - E::number_of_columns(), - ) - .unwrap()[0] - .clone(); - let parent_root = harness.chain.head().head_block_root(); - sidecar.signed_block_header.message.parent_root = parent_root; + let data_column_sidecars = + generate_data_column_sidecars_from_block(&signed_block, &harness.chain.spec); + let sidecar = data_column_sidecars[0].clone(); let expected_sse_data_column = SseDataColumnSidecar::from_data_column_sidecar(&sidecar); let _ = harness .chain - .process_rpc_custody_columns(vec![Arc::new(sidecar)]) + .process_rpc_custody_columns(vec![sidecar]) .await .unwrap(); diff --git a/beacon_node/beacon_chain/tests/main.rs b/beacon_node/beacon_chain/tests/main.rs index f0978c5f05..aec4416419 100644 --- a/beacon_node/beacon_chain/tests/main.rs +++ b/beacon_node/beacon_chain/tests/main.rs @@ -1,8 +1,10 @@ mod attestation_production; mod attestation_verification; mod bellatrix; +mod blob_verification; mod block_verification; mod capella; +mod column_verification; mod events; mod op_verification; mod payload_invalidation; diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 0b7004781f..b94490b4d5 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -3,7 +3,9 @@ use beacon_chain::attestation_verification::Error as AttnError; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::builder::BeaconChainBuilder; +use beacon_chain::custody_context::CUSTODY_CHANGE_DA_EFFECTIVE_DELAY_SECONDS; use beacon_chain::data_availability_checker::AvailableBlock; +use beacon_chain::historical_data_columns::HistoricalDataColumnError; use 
beacon_chain::schema_change::migrate_schema; use beacon_chain::test_utils::SyncCommitteeStrategy; use beacon_chain::test_utils::{ @@ -13,7 +15,12 @@ use beacon_chain::test_utils::{ use beacon_chain::{ BeaconChain, BeaconChainError, BeaconChainTypes, BeaconSnapshot, BlockError, ChainConfig, NotifyExecutionLayer, ServerSentEventHandler, WhenSlotSkipped, - data_availability_checker::MaybeAvailableBlock, historical_blocks::HistoricalBlockError, + beacon_proposer_cache::{ + compute_proposer_duties_from_head, ensure_state_can_determine_proposers_for_epoch, + }, + custody_context::NodeCustodyType, + data_availability_checker::MaybeAvailableBlock, + historical_blocks::HistoricalBlockError, migrate::MigratorConfig, }; use logging::create_test_tracing_subscriber; @@ -92,7 +99,12 @@ fn get_harness( reconstruct_historic_states: true, ..ChainConfig::default() }; - get_harness_generic(store, validator_count, chain_config, false) + get_harness_generic( + store, + validator_count, + chain_config, + NodeCustodyType::Fullnode, + ) } fn get_harness_import_all_data_columns( @@ -104,14 +116,19 @@ fn get_harness_import_all_data_columns( reconstruct_historic_states: true, ..ChainConfig::default() }; - get_harness_generic(store, validator_count, chain_config, true) + get_harness_generic( + store, + validator_count, + chain_config, + NodeCustodyType::Supernode, + ) } fn get_harness_generic( store: Arc, BeaconNodeBackend>>, validator_count: usize, chain_config: ChainConfig, - import_all_data_columns: bool, + node_custody_type: NodeCustodyType, ) -> TestHarness { let harness = TestHarness::builder(MinimalEthSpec) .spec(store.get_chain_spec().clone()) @@ -119,7 +136,7 @@ fn get_harness_generic( .fresh_disk_store(store) .mock_execution_layer() .chain_config(chain_config) - .import_all_data_columns(import_all_data_columns) + .node_custody_type(node_custody_type) .build(); harness.advance_slot(); harness @@ -1273,19 +1290,34 @@ async fn proposer_shuffling_root_consistency_test( #[tokio::test] 
async fn proposer_shuffling_root_consistency_same_epoch() { let spec = test_spec::(); - proposer_shuffling_root_consistency_test(spec, 32, 39).await; + proposer_shuffling_root_consistency_test( + spec, + 4 * E::slots_per_epoch(), + 5 * E::slots_per_epoch() - 1, + ) + .await; } #[tokio::test] async fn proposer_shuffling_root_consistency_next_epoch() { let spec = test_spec::(); - proposer_shuffling_root_consistency_test(spec, 32, 47).await; + proposer_shuffling_root_consistency_test( + spec, + 4 * E::slots_per_epoch(), + 6 * E::slots_per_epoch() - 1, + ) + .await; } #[tokio::test] async fn proposer_shuffling_root_consistency_two_epochs() { let spec = test_spec::(); - proposer_shuffling_root_consistency_test(spec, 32, 55).await; + proposer_shuffling_root_consistency_test( + spec, + 4 * E::slots_per_epoch(), + 7 * E::slots_per_epoch() - 1, + ) + .await; } #[tokio::test] @@ -1501,6 +1533,120 @@ async fn proposer_shuffling_changing_with_lookahead() { ); } +#[tokio::test] +async fn proposer_duties_from_head_fulu() { + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, Default::default(), spec.clone()); + let validators_keypairs = + types::test_utils::generate_deterministic_keypairs(LOW_VALIDATOR_COUNT); + let harness = TestHarness::builder(MinimalEthSpec) + .spec(spec.into()) + .keypairs(validators_keypairs) + .fresh_disk_store(store) + .mock_execution_layer() + .build(); + let spec = &harness.chain.spec; + + let initial_blocks = E::slots_per_epoch() * 3; + + // Build chain out to parent block. 
+ let initial_slots: Vec = (1..=initial_blocks).map(Into::into).collect(); + let (state, state_root) = harness.get_current_state_and_root(); + let all_validators = harness.get_all_validators(); + let (_, _, head_block_root, head_state) = harness + .add_attested_blocks_at_slots(state, state_root, &initial_slots, &all_validators) + .await; + + // Compute the proposer duties at the next epoch from the head + let next_epoch = head_state.next_epoch().unwrap(); + let (_indices, dependent_root, _, fork) = + compute_proposer_duties_from_head(next_epoch, &harness.chain).unwrap(); + + assert_eq!( + dependent_root, + head_state + .proposer_shuffling_decision_root_at_epoch(next_epoch, head_block_root.into(), spec) + .unwrap() + ); + assert_eq!(fork, head_state.fork()); +} + +/// Test that we can compute the proposer shuffling for the Gloas fork epoch itself using lookahead! +#[tokio::test] +async fn proposer_lookahead_gloas_fork_epoch() { + let gloas_fork_epoch = Epoch::new(4); + let mut spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + spec.gloas_fork_epoch = Some(gloas_fork_epoch); + + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, Default::default(), spec.clone()); + let validators_keypairs = + types::test_utils::generate_deterministic_keypairs(LOW_VALIDATOR_COUNT); + let harness = TestHarness::builder(E::default()) + .spec(spec.into()) + .keypairs(validators_keypairs) + .fresh_disk_store(store) + .mock_execution_layer() + .build(); + let spec = &harness.chain.spec; + + let initial_blocks = (gloas_fork_epoch - 1) + .start_slot(E::slots_per_epoch()) + .as_u64(); + + // Build chain out to parent block. 
+ let initial_slots: Vec = (1..=initial_blocks).map(Into::into).collect(); + let (state, state_root) = harness.get_current_state_and_root(); + let all_validators = harness.get_all_validators(); + let (_, _, head_block_root, mut head_state) = harness + .add_attested_blocks_at_slots(state, state_root, &initial_slots, &all_validators) + .await; + let head_state_root = head_state.canonical_root().unwrap(); + + // Check that we have access to the next epoch shuffling according to + // `ensure_state_can_determine_proposers_for_epoch`. + ensure_state_can_determine_proposers_for_epoch( + &mut head_state, + head_state_root, + gloas_fork_epoch, + spec, + ) + .unwrap(); + assert_eq!(head_state.current_epoch(), gloas_fork_epoch - 1); + + // Compute the proposer duties at the fork epoch from the head. + let (indices, dependent_root, _, fork) = + compute_proposer_duties_from_head(gloas_fork_epoch, &harness.chain).unwrap(); + + assert_eq!( + dependent_root, + head_state + .proposer_shuffling_decision_root_at_epoch( + gloas_fork_epoch, + head_block_root.into(), + spec + ) + .unwrap() + ); + assert_ne!(fork, head_state.fork()); + assert_eq!(fork, spec.fork_at_epoch(gloas_fork_epoch)); + + // Build a block in the Gloas fork epoch and assert that the shuffling does not change. 
+ let gloas_slots = vec![gloas_fork_epoch.start_slot(E::slots_per_epoch())]; + let (_, _, _, _) = harness + .add_attested_blocks_at_slots(head_state, head_state_root, &gloas_slots, &all_validators) + .await; + + let (no_lookahead_indices, no_lookahead_dependent_root, _, no_lookahead_fork) = + compute_proposer_duties_from_head(gloas_fork_epoch, &harness.chain).unwrap(); + + assert_eq!(no_lookahead_indices, indices); + assert_eq!(no_lookahead_dependent_root, dependent_root); + assert_eq!(no_lookahead_fork, fork); +} + // Ensure blocks from abandoned forks are pruned from the Hot DB #[tokio::test] async fn prunes_abandoned_fork_between_two_finalized_checkpoints() { @@ -3036,6 +3182,245 @@ async fn weak_subjectivity_sync_test( assert_eq!(store.get_anchor_info().state_upper_limit, Slot::new(0)); } +#[tokio::test] +async fn test_import_historical_data_columns_batch() { + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + let start_slot = Epoch::new(0).start_slot(E::slots_per_epoch()) + 1; + let end_slot = Epoch::new(0).end_slot(E::slots_per_epoch()); + + let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT); + + harness + .extend_chain( + (E::slots_per_epoch() * 2) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + harness.advance_slot(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + let mut data_columns_list = vec![]; + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_some()); + for data_column in data_columns.unwrap() { + data_columns_list.push(data_column); + } + } + + harness + .extend_chain( + (E::slots_per_epoch() * 4) as usize, + BlockStrategy::OnCanonicalHead, + 
AttestationStrategy::AllValidators, + ) + .await; + + harness.advance_slot(); + + harness + .chain + .store + .try_prune_blobs(true, Epoch::new(2)) + .unwrap(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_none()) + } + + harness + .chain + .import_historical_data_column_batch(Epoch::new(0), data_columns_list) + .unwrap(); + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_some()) + } +} + +// This should verify that a data column sidecar containing mismatched block roots should fail to be imported. +#[tokio::test] +async fn test_import_historical_data_columns_batch_mismatched_block_root() { + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + let start_slot = Slot::new(1); + let end_slot = Slot::new(E::slots_per_epoch() * 2 - 1); + + let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT); + + harness + .extend_chain( + (E::slots_per_epoch() * 2) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + harness.advance_slot(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + let mut data_columns_list = vec![]; + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_some()); + + for data_column in 
data_columns.unwrap() { + let mut data_column = (*data_column).clone(); + if data_column.index % 2 == 0 { + data_column.signed_block_header.message.body_root = Hash256::ZERO; + } + + data_columns_list.push(Arc::new(data_column)); + } + } + + harness + .extend_chain( + (E::slots_per_epoch() * 4) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + harness.advance_slot(); + + harness + .chain + .store + .try_prune_blobs(true, Epoch::new(2)) + .unwrap(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_none()) + } + + let error = harness + .chain + .import_historical_data_column_batch( + start_slot.epoch(E::slots_per_epoch()), + data_columns_list, + ) + .unwrap_err(); + + assert!(matches!( + error, + HistoricalDataColumnError::NoBlockFound { .. } + )); +} + +// This should verify that a data column sidecar associated to a block root that doesn't exist in the store cannot +// be imported. 
+#[tokio::test] +async fn test_import_historical_data_columns_batch_no_block_found() { + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + let start_slot = Slot::new(1); + let end_slot = Slot::new(E::slots_per_epoch() * 2 - 1); + + let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT); + + harness + .extend_chain( + (E::slots_per_epoch() * 2) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + harness.advance_slot(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + let mut data_columns_list = vec![]; + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_some()); + + for data_column in data_columns.unwrap() { + let mut data_column = (*data_column).clone(); + data_column.signed_block_header.message.body_root = Hash256::ZERO; + data_columns_list.push(Arc::new(data_column)); + } + } + + harness + .extend_chain( + (E::slots_per_epoch() * 4) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + harness.advance_slot(); + + harness + .chain + .store + .try_prune_blobs(true, Epoch::new(2)) + .unwrap(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_none()) + } + + let error = harness + .chain + .import_historical_data_column_batch(Epoch::new(0), data_columns_list) + .unwrap_err(); + + assert!(matches!( + error, + HistoricalDataColumnError::NoBlockFound { .. 
} + )); +} + /// Test that blocks and attestations that refer to states around an unaligned split state are /// processed correctly. #[tokio::test] @@ -3046,7 +3431,12 @@ async fn process_blocks_and_attestations_for_unaligned_checkpoint() { reconstruct_historic_states: false, ..ChainConfig::default() }; - let harness = get_harness_generic(store.clone(), LOW_VALIDATOR_COUNT, chain_config, false); + let harness = get_harness_generic( + store.clone(), + LOW_VALIDATOR_COUNT, + chain_config, + NodeCustodyType::Fullnode, + ); let all_validators = (0..LOW_VALIDATOR_COUNT).collect::>(); @@ -3465,14 +3855,13 @@ async fn schema_downgrade_to_min_version( reconstruct_historic_states, ..ChainConfig::default() }; - let import_all_data_columns = false; let store = get_store_generic(&db_path, store_config.clone(), spec.clone()); let harness = get_harness_generic( store.clone(), LOW_VALIDATOR_COUNT, chain_config.clone(), - import_all_data_columns, + NodeCustodyType::Fullnode, ); harness @@ -3606,9 +3995,10 @@ async fn deneb_prune_blobs_happy_case() { let store = get_store(&db_path); if store.get_chain_spec().is_peer_das_scheduled() { - // TODO(fulu): add prune tests for Fulu / PeerDAS data columns. + // Blob pruning no longer needed since Fulu / PeerDAS return; } + let Some(deneb_fork_epoch) = store.get_chain_spec().deneb_fork_epoch else { // No-op prior to Deneb. return; @@ -3657,9 +4047,10 @@ async fn deneb_prune_blobs_no_finalization() { let store = get_store(&db_path); if store.get_chain_spec().is_peer_das_scheduled() { - // TODO(fulu): add prune tests for Fulu / PeerDAS data columns. + // Blob pruning no longer needed since Fulu / PeerDAS return; } + let Some(deneb_fork_epoch) = store.get_chain_spec().deneb_fork_epoch else { // No-op prior to Deneb. return; @@ -3716,29 +4107,39 @@ async fn deneb_prune_blobs_no_finalization() { /// Check that blob pruning does not fail trying to prune across the fork boundary. 
#[tokio::test] -async fn deneb_prune_blobs_fork_boundary() { - let deneb_fork_epoch = Epoch::new(4); +async fn prune_blobs_across_fork_boundary() { let mut spec = ForkName::Capella.make_genesis_spec(E::default_spec()); + + let deneb_fork_epoch = Epoch::new(4); spec.deneb_fork_epoch = Some(deneb_fork_epoch); let deneb_fork_slot = deneb_fork_epoch.start_slot(E::slots_per_epoch()); + let electra_fork_epoch = Epoch::new(8); + spec.electra_fork_epoch = Some(electra_fork_epoch); + + let fulu_fork_epoch = Epoch::new(12); + spec.fulu_fork_epoch = Some(fulu_fork_epoch); + let db_path = tempdir().unwrap(); let store = get_store_generic(&db_path, StoreConfig::default(), spec); let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); - let num_blocks = E::slots_per_epoch() * 7; + let blocks_to_deneb_finalization = E::slots_per_epoch() * 7; + let blocks_to_electra_finalization = E::slots_per_epoch() * 4; + let blocks_to_fulu_finalization = E::slots_per_epoch() * 4; - // Finalize to epoch 5. + // Extend the chain to epoch 7 + // Finalize to epoch 5 (Deneb). harness .extend_chain( - num_blocks as usize, + blocks_to_deneb_finalization as usize, BlockStrategy::OnCanonicalHead, AttestationStrategy::AllValidators, ) .await; - // Finalization should be at epoch 5. + // Finalization should be at epoch 5 (Deneb). 
let finalized_epoch = Epoch::new(5); let finalized_slot = finalized_epoch.start_slot(E::slots_per_epoch()); assert_eq!( @@ -3777,6 +4178,116 @@ async fn deneb_prune_blobs_fork_boundary() { assert_eq!(store.get_blob_info().oldest_blob_slot, Some(pruned_slot)); check_blob_existence(&harness, Slot::new(0), pruned_slot - 1, false); check_blob_existence(&harness, pruned_slot, harness.head_slot(), true); + + // Extend the chain to epoch 11 + // Finalize to epoch 9 (Electra) + harness.advance_slot(); + harness + .extend_chain( + blocks_to_electra_finalization as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Finalization should be at epoch 9 (Electra). + let finalized_epoch = Epoch::new(9); + let finalized_slot = finalized_epoch.start_slot(E::slots_per_epoch()); + assert_eq!( + harness.get_current_state().finalized_checkpoint().epoch, + finalized_epoch + ); + assert_eq!(store.get_split_slot(), finalized_slot); + + // All blobs since last pruning during Deneb should still be available. 
+ assert_eq!(store.get_blob_info().oldest_blob_slot, Some(pruned_slot)); + + let electra_first_slot = electra_fork_epoch.start_slot(E::slots_per_epoch()); + // Check that blobs exist from the pruned slot to electra + check_blob_existence(&harness, pruned_slot, electra_first_slot - 1, true); + + // Trigger pruning on Electra + let pruned_slot = (electra_fork_epoch + 1).start_slot(E::slots_per_epoch()); + + store.try_prune_blobs(true, finalized_epoch).unwrap(); + assert_eq!(store.get_blob_info().oldest_blob_slot, Some(finalized_slot)); + check_blob_existence(&harness, Slot::new(0), pruned_slot - 1, false); + check_blob_existence(&harness, pruned_slot, harness.head_slot(), true); + + // Check that blobs have been pruned up to the pruned slot + check_blob_existence(&harness, Slot::new(0), pruned_slot - 1, false); + // Check that blobs exist from electra to the current head + check_blob_existence(&harness, electra_first_slot, harness.head_slot(), true); + + // Extend the chain to epoch 15 + // Finalize to epoch 13 (Fulu) + harness.advance_slot(); + harness + .extend_chain( + blocks_to_fulu_finalization as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Finalization should be at epoch 13 (Fulu). + let finalized_epoch = Epoch::new(13); + let finalized_slot = finalized_epoch.start_slot(E::slots_per_epoch()); + assert_eq!( + harness.get_current_state().finalized_checkpoint().epoch, + finalized_epoch + ); + assert_eq!(store.get_split_slot(), finalized_slot); + + // All blobs since last pruning during Electra should still be available. 
+ assert_eq!(store.get_blob_info().oldest_blob_slot, Some(pruned_slot)); + + let fulu_first_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch()); + // Check that blobs have been pruned up to the pruned slot + check_blob_existence(&harness, Slot::new(0), pruned_slot - 1, false); + // Check that blobs exist from the pruned slot to Fulu + check_blob_existence(&harness, pruned_slot, fulu_first_slot - 1, true); + // Check that blobs do not exist from Fulu to the current head + check_blob_existence(&harness, fulu_first_slot, harness.head_slot(), false); + + // Attempt pruning with at different epochs. No pruning should occur for epochs + // preceding Fulu, as we have already triggered pruning pre-Fulu. Pruning should occur + // for epochs after Fulu. + assert!(fulu_fork_epoch < finalized_epoch); + for data_availability_boundary in [ + Epoch::new(7), + electra_fork_epoch, + Epoch::new(9), + Epoch::new(11), + fulu_fork_epoch, + Epoch::new(15), + ] { + store + .try_prune_blobs(true, data_availability_boundary) + .unwrap(); + + let oldest_slot = data_availability_boundary.start_slot(E::slots_per_epoch()); + + if data_availability_boundary < fulu_fork_epoch { + // Pre Fulu fork epochs + // Check oldest blob slot is not updated. 
+ assert!(store.get_blob_info().oldest_blob_slot >= Some(oldest_slot)); + check_blob_existence(&harness, Slot::new(0), oldest_slot - 1, false); + // Blobs should exist + check_blob_existence(&harness, oldest_slot, harness.head_slot(), true); + } else { + // Fulu fork epochs + // Pruning should have been triggered + assert!(store.get_blob_info().oldest_blob_slot <= Some(oldest_slot)); + // Oldest blost slot should never be greater than the first fulu slot + let fulu_first_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch()); + assert!(store.get_blob_info().oldest_blob_slot <= Some(fulu_first_slot)); + // Blobs should not exist post-Fulu + check_blob_existence(&harness, oldest_slot, harness.head_slot(), false); + // Data columns should exist post-Fulu + check_data_column_existence(&harness, oldest_slot, harness.head_slot(), true); + }; + } } /// Check that blob pruning prunes blobs older than the data availability boundary with margin @@ -3805,9 +4316,10 @@ async fn deneb_prune_blobs_margin_test(margin: u64) { let store = get_store_generic(&db_path, config, test_spec::()); if store.get_chain_spec().is_peer_das_scheduled() { - // TODO(fulu): add prune tests for Fulu / PeerDAS data columns. + // Blob pruning no longer needed since Fulu / PeerDAS return; } + let Some(deneb_fork_epoch) = store.get_chain_spec().deneb_fork_epoch else { // No-op prior to Deneb. return; @@ -3917,6 +4429,368 @@ fn check_blob_existence( } } +/// Check that blob pruning prunes data columns older than the data availability boundary. +#[tokio::test] +async fn fulu_prune_data_columns_happy_case() { + let db_path = tempdir().unwrap(); + let store = get_store(&db_path); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. + return; + } + let Some(fulu_fork_epoch) = store.get_chain_spec().fulu_fork_epoch else { + // No-op prior to Fulu. 
+        return;
+    };
+    let fulu_fork_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch());
+
+    let num_blocks_produced = E::slots_per_epoch() * 8;
+    let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT);
+
+    harness
+        .extend_chain(
+            num_blocks_produced as usize,
+            BlockStrategy::OnCanonicalHead,
+            AttestationStrategy::AllValidators,
+        )
+        .await;
+
+    // Prior to manual pruning with an artificially low data availability boundary all data columns
+    // should be stored.
+    assert_eq!(
+        store.get_data_column_info().oldest_data_column_slot,
+        Some(fulu_fork_slot)
+    );
+    check_data_column_existence(&harness, Slot::new(1), harness.head_slot(), true);
+
+    // Trigger pruning of data columns older than epoch 2.
+    let data_availability_boundary = Epoch::new(2);
+    store
+        .try_prune_blobs(true, data_availability_boundary)
+        .unwrap();
+
+    // Check oldest data column slot is updated accordingly and prior data columns have been
+    // deleted.
+    let oldest_data_column_slot = store
+        .get_data_column_info()
+        .oldest_data_column_slot
+        .unwrap();
+    assert_eq!(
+        oldest_data_column_slot,
+        data_availability_boundary.start_slot(E::slots_per_epoch())
+    );
+    check_data_column_existence(&harness, Slot::new(0), oldest_data_column_slot - 1, false);
+    check_data_column_existence(&harness, oldest_data_column_slot, harness.head_slot(), true);
+}
+
+/// Check that blob pruning does not prune data columns without finalization.
+#[tokio::test]
+async fn fulu_prune_data_columns_no_finalization() {
+    let db_path = tempdir().unwrap();
+    let store = get_store(&db_path);
+
+    if !store.get_chain_spec().is_peer_das_scheduled() {
+        // No-op if PeerDAS not scheduled.
+        return;
+    }
+    let Some(fulu_fork_epoch) = store.get_chain_spec().fulu_fork_epoch else {
+        // No-op prior to Fulu.
+ return; + }; + let fulu_fork_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch()); + + let initial_num_blocks = E::slots_per_epoch() * 5; + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + // Finalize to epoch 3. + harness + .extend_chain( + initial_num_blocks as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Extend the chain for another few epochs without attestations. + let unfinalized_num_blocks = E::slots_per_epoch() * 3; + harness.advance_slot(); + harness + .extend_chain( + unfinalized_num_blocks as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::SomeValidators(vec![]), + ) + .await; + + // Finalization should be at epoch 3. + let finalized_slot = Slot::new(E::slots_per_epoch() * 3); + assert_eq!(harness.get_current_state().finalized_checkpoint().epoch, 3); + assert_eq!(store.get_split_slot(), finalized_slot); + + // All data columns should still be available. + assert_eq!( + store.get_data_column_info().oldest_data_column_slot, + Some(fulu_fork_slot) + ); + check_data_column_existence(&harness, Slot::new(0), harness.head_slot(), true); + + // Attempt pruning of data columns older than epoch 4, which is newer than finalization. + let data_availability_boundary = Epoch::new(4); + store + .try_prune_blobs(true, data_availability_boundary) + .unwrap(); + + // Check oldest data column slot is only updated to finalization, and NOT to the DAB. + let oldest_data_column_slot = store + .get_data_column_info() + .oldest_data_column_slot + .unwrap(); + assert_eq!(oldest_data_column_slot, finalized_slot); + check_data_column_existence(&harness, Slot::new(0), finalized_slot - 1, false); + check_data_column_existence(&harness, finalized_slot, harness.head_slot(), true); +} + +/// Check that data column pruning does not fail trying to prune across the fork boundary. 
+#[tokio::test] +async fn fulu_prune_data_columns_fork_boundary() { + let mut spec = ForkName::Electra.make_genesis_spec(E::default_spec()); + let fulu_fork_epoch = Epoch::new(4); + spec.fulu_fork_epoch = Some(fulu_fork_epoch); + let fulu_fork_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch()); + + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. + panic!("PeerDAS not scheduled"); + //return; + } + + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + let num_blocks = E::slots_per_epoch() * 7; + + // Finalize to epoch 5. + harness + .extend_chain( + num_blocks as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Finalization should be at epoch 5. + let finalized_epoch = Epoch::new(5); + let finalized_slot = finalized_epoch.start_slot(E::slots_per_epoch()); + assert_eq!( + harness.get_current_state().finalized_checkpoint().epoch, + finalized_epoch + ); + assert_eq!(store.get_split_slot(), finalized_slot); + + // All data columns should still be available. + assert_eq!( + store.get_data_column_info().oldest_data_column_slot, + Some(fulu_fork_slot) + ); + check_data_column_existence(&harness, Slot::new(0), harness.head_slot(), true); + + // Attempt pruning with data availability epochs that precede the fork epoch. + // No pruning should occur. + assert!(fulu_fork_epoch < finalized_epoch); + for data_availability_boundary in [Epoch::new(0), Epoch::new(3), fulu_fork_epoch] { + store + .try_prune_blobs(true, data_availability_boundary) + .unwrap(); + + // Check oldest data column slot is not updated. + assert_eq!( + store.get_data_column_info().oldest_data_column_slot, + Some(fulu_fork_slot) + ); + } + // All data columns should still be available. 
+ check_data_column_existence(&harness, Slot::new(0), harness.head_slot(), true); + + // Prune one epoch past the fork. + let pruned_slot = (fulu_fork_epoch + 1).start_slot(E::slots_per_epoch()); + store.try_prune_blobs(true, fulu_fork_epoch + 1).unwrap(); + assert_eq!( + store.get_data_column_info().oldest_data_column_slot, + Some(pruned_slot) + ); + check_data_column_existence(&harness, Slot::new(0), pruned_slot - 1, false); + check_data_column_existence(&harness, pruned_slot, harness.head_slot(), true); +} + +#[tokio::test] +async fn test_column_da_boundary() { + let mut spec = ForkName::Electra.make_genesis_spec(E::default_spec()); + let fulu_fork_epoch = Epoch::new(4); + spec.fulu_fork_epoch = Some(fulu_fork_epoch); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. + panic!("PeerDAS not scheduled"); + } + + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + // The column da boundary should be the fulu fork epoch + assert_eq!( + harness.chain.column_data_availability_boundary(), + Some(fulu_fork_epoch) + ); +} + +#[tokio::test] +async fn test_earliest_custodied_data_column_epoch() { + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + let custody_info_epoch = Epoch::new(4); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. 
+ panic!("PeerDAS not scheduled"); + } + + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + // earliest custody info is set to the last slot in `custody_info_epoch` + harness + .chain + .update_data_column_custody_info(Some(custody_info_epoch.end_slot(E::slots_per_epoch()))); + + // earliest custodied data column epoch should be `custody_info_epoch` + 1 + assert_eq!( + harness.chain.earliest_custodied_data_column_epoch(), + Some(custody_info_epoch + 1) + ); + + // earliest custody info is set to the first slot in `custody_info_epoch` + harness + .chain + .update_data_column_custody_info(Some(custody_info_epoch.start_slot(E::slots_per_epoch()))); + + // earliest custodied data column epoch should be `custody_info_epoch` + assert_eq!( + harness.chain.earliest_custodied_data_column_epoch(), + Some(custody_info_epoch) + ); +} + +/// Check that blob pruning prunes data columns older than the data availability boundary with +/// margin applied. +#[tokio::test] +async fn fulu_prune_data_columns_margin1() { + fulu_prune_data_columns_margin_test(1).await; +} + +#[tokio::test] +async fn fulu_prune_data_columns_margin3() { + fulu_prune_data_columns_margin_test(3).await; +} + +#[tokio::test] +async fn fulu_prune_data_columns_margin4() { + fulu_prune_data_columns_margin_test(4).await; +} + +async fn fulu_prune_data_columns_margin_test(margin: u64) { + let config = StoreConfig { + blob_prune_margin_epochs: margin, + ..StoreConfig::default() + }; + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, config, test_spec::()); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. + return; + } + let Some(fulu_fork_epoch) = store.get_chain_spec().fulu_fork_epoch else { + // No-op prior to Fulu. 
+        return;
+    };
+    let fulu_fork_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch());
+
+    let num_blocks_produced = E::slots_per_epoch() * 8;
+    let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT);
+
+    harness
+        .extend_chain(
+            num_blocks_produced as usize,
+            BlockStrategy::OnCanonicalHead,
+            AttestationStrategy::AllValidators,
+        )
+        .await;
+
+    // Prior to manual pruning with an artificially low data availability boundary all data
+    // columns should be stored.
+    assert_eq!(
+        store.get_data_column_info().oldest_data_column_slot,
+        Some(fulu_fork_slot)
+    );
+    check_data_column_existence(&harness, Slot::new(1), harness.head_slot(), true);
+
+    // Trigger pruning of data columns older than epoch 6 - margin (6 is the minimum, due to
+    // finalization).
+    let data_availability_boundary = Epoch::new(6);
+    let effective_data_availability_boundary =
+        data_availability_boundary - store.get_config().blob_prune_margin_epochs;
+    assert!(
+        effective_data_availability_boundary > 0,
+        "must be > 0 because epoch 0 won't get pruned alone"
+    );
+    store
+        .try_prune_blobs(true, data_availability_boundary)
+        .unwrap();
+
+    // Check oldest data column slot is updated accordingly and prior columns have been deleted.
+    let oldest_data_column_slot = store
+        .get_data_column_info()
+        .oldest_data_column_slot
+        .unwrap();
+    assert_eq!(
+        oldest_data_column_slot,
+        effective_data_availability_boundary.start_slot(E::slots_per_epoch())
+    );
+    check_data_column_existence(&harness, Slot::new(0), oldest_data_column_slot - 1, false);
+    check_data_column_existence(&harness, oldest_data_column_slot, harness.head_slot(), true);
+}
+
+/// Check that there are data column sidecars (or not) at every slot in the range.
+fn check_data_column_existence( + harness: &TestHarness, + start_slot: Slot, + end_slot: Slot, + should_exist: bool, +) { + let mut columns_seen = 0; + for (block_root, slot) in harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap() + .map(Result::unwrap) + { + if let Some(columns) = harness.chain.store.get_data_columns(&block_root).unwrap() { + assert!(should_exist, "columns at slot {slot} exist but should not"); + columns_seen += columns.len(); + } else { + // We don't actually store empty columns, so unfortunately we can't assert anything + // meaningful here (like asserting that the column should not exist). + } + } + if should_exist { + assert_ne!(columns_seen, 0, "expected non-zero number of columns"); + } +} + #[tokio::test] async fn prune_historic_states() { let num_blocks_produced = E::slots_per_epoch() * 5; @@ -4003,14 +4877,13 @@ async fn ancestor_state_root_prior_to_split() { reconstruct_historic_states: false, ..ChainConfig::default() }; - let import_all_data_columns = false; let store = get_store_generic(&db_path, store_config, spec); let harness = get_harness_generic( store.clone(), LOW_VALIDATOR_COUNT, chain_config, - import_all_data_columns, + NodeCustodyType::Fullnode, ); // Produce blocks until we have passed through two full snapshot periods. This period length is @@ -4097,14 +4970,13 @@ async fn replay_from_split_state() { reconstruct_historic_states: false, ..ChainConfig::default() }; - let import_all_data_columns = false; let store = get_store_generic(&db_path, store_config.clone(), spec.clone()); let harness = get_harness_generic( store.clone(), LOW_VALIDATOR_COUNT, chain_config, - import_all_data_columns, + NodeCustodyType::Fullnode, ); // Produce blocks until we finalize epoch 3 which will not be stored as a snapshot. 
@@ -4227,6 +5099,166 @@ async fn test_custody_column_filtering_supernode() { ); } +#[tokio::test] +async fn test_missing_columns_after_cgc_change() { + let spec = test_spec::(); + + let num_validators = 8; + + let num_epochs_before_increase = 4; + + let harness = BeaconChainHarness::builder(E::default()) + .spec(spec.clone().into()) + .deterministic_keypairs(num_validators) + .fresh_ephemeral_store() + .mock_execution_layer() + .build(); + + let state = harness.chain.head_beacon_state_cloned(); + + if !state.fork_name_unchecked().fulu_enabled() { + return; + } + + let custody_context = harness.chain.data_availability_checker.custody_context(); + + harness.advance_slot(); + harness + .extend_chain( + (E::slots_per_epoch() * num_epochs_before_increase) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let epoch_before_increase = Epoch::new(num_epochs_before_increase); + + let missing_columns = harness + .chain + .get_missing_columns_for_epoch(epoch_before_increase); + + // We should have no missing columns + assert_eq!(missing_columns.len(), 0); + + let epoch_after_increase = Epoch::new(num_epochs_before_increase + 2); + + let cgc_change_slot = epoch_before_increase.end_slot(E::slots_per_epoch()); + custody_context.register_validators(vec![(1, 32_000_000_000 * 9)], cgc_change_slot, &spec); + + harness.advance_slot(); + harness + .extend_chain( + (E::slots_per_epoch() * 5) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // We should have missing columns from before the cgc increase + let missing_columns = harness + .chain + .get_missing_columns_for_epoch(epoch_before_increase); + + assert!(!missing_columns.is_empty()); + + // We should have no missing columns after the cgc increase + let missing_columns = harness + .chain + .get_missing_columns_for_epoch(epoch_after_increase); + + assert!(missing_columns.is_empty()); +} + +#[tokio::test] +async fn 
test_safely_backfill_data_column_custody_info() { + let spec = test_spec::(); + + let num_validators = 8; + + let start_epochs = 4; + + let harness = BeaconChainHarness::builder(E::default()) + .spec(spec.clone().into()) + .deterministic_keypairs(num_validators) + .fresh_ephemeral_store() + .mock_execution_layer() + .build(); + + let state = harness.chain.head_beacon_state_cloned(); + + if !state.fork_name_unchecked().fulu_enabled() { + return; + } + + let custody_context = harness.chain.data_availability_checker.custody_context(); + + harness.advance_slot(); + harness + .extend_chain( + (E::slots_per_epoch() * start_epochs) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let epoch_before_increase = Epoch::new(start_epochs); + let effective_delay_slots = + CUSTODY_CHANGE_DA_EFFECTIVE_DELAY_SECONDS / harness.chain.spec.seconds_per_slot; + + let cgc_change_slot = epoch_before_increase.end_slot(E::slots_per_epoch()); + + custody_context.register_validators(vec![(1, 32_000_000_000 * 16)], cgc_change_slot, &spec); + + let epoch_after_increase = + (cgc_change_slot + effective_delay_slots).epoch(E::slots_per_epoch()); + + harness.advance_slot(); + harness + .extend_chain( + (E::slots_per_epoch() * 5) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let head_slot = harness.chain.head().snapshot.beacon_block.slot(); + + harness + .chain + .update_data_column_custody_info(Some(head_slot)); + + // We can only safely update custody column info 1 epoch at a time + // Skipping an epoch should return an error + harness + .chain + .safely_backfill_data_column_custody_info(head_slot.epoch(E::slots_per_epoch()) - 2) + .unwrap_err(); + + // Iterate from the head epoch back to 0 and try to backfill data column custody info + for epoch in (0..head_slot.epoch(E::slots_per_epoch()).into()).rev() { + // This is an epoch before the cgc change took into effect, we shouldnt be able to 
update + // without performing custody backfill sync + if epoch <= epoch_after_increase.into() { + harness + .chain + .safely_backfill_data_column_custody_info(Epoch::new(epoch)) + .unwrap_err(); + } else { + // This is an epoch after the cgc change took into effect, we should be able to update + // as long as we iterate epoch by epoch + harness + .chain + .safely_backfill_data_column_custody_info(Epoch::new(epoch)) + .unwrap(); + let earliest_available_epoch = harness + .chain + .earliest_custodied_data_column_epoch() + .unwrap(); + assert_eq!(Epoch::new(epoch), earliest_available_epoch); + } + } +} + /// Checks that two chains are the same, for the purpose of these tests. /// /// Several fields that are hard/impossible to check are ignored (e.g., the store). diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs index 8c33cf5869..c99388287c 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs @@ -84,8 +84,9 @@ pub const BACKFILL_SCHEDULE_IN_SLOT: [(u32, u32); 3] = [ (4, 5), ]; -/// Trigger reconstruction if we are this many seconds into the current slot -pub const RECONSTRUCTION_DEADLINE: Duration = Duration::from_millis(3000); +/// Fraction of slot duration after which column reconstruction is triggered, makes it easier for +/// different slot timings to have a generalised deadline +pub const RECONSTRUCTION_DEADLINE: (u64, u64) = (1, 4); /// Messages that the scheduler can receive. 
#[derive(AsRefStr)] @@ -756,13 +757,17 @@ impl ReprocessQueue { } InboundEvent::Msg(DelayColumnReconstruction(request)) => { let mut reconstruction_delay = QUEUED_RECONSTRUCTION_DELAY; - if let Some(seconds_from_current_slot) = - self.slot_clock.seconds_from_current_slot_start() + let slot_duration = self.slot_clock.slot_duration().as_millis() as u64; + let reconstruction_deadline_millis = + (slot_duration * RECONSTRUCTION_DEADLINE.0) / RECONSTRUCTION_DEADLINE.1; + let reconstruction_deadline = Duration::from_millis(reconstruction_deadline_millis); + if let Some(duration_from_current_slot) = + self.slot_clock.millis_from_current_slot_start() && let Some(current_slot) = self.slot_clock.now() - && seconds_from_current_slot >= RECONSTRUCTION_DEADLINE + && duration_from_current_slot >= reconstruction_deadline && current_slot == request.slot { - // If we are at least `RECONSTRUCTION_DEADLINE` seconds into the current slot, + // If we are at least `reconstruction_deadline` seconds into the current slot, // and the reconstruction request is for the current slot, process reconstruction immediately. reconstruction_delay = Duration::from_secs(0); } @@ -1222,4 +1227,116 @@ mod tests { // The entry for the block root should be gone. 
assert!(queue.awaiting_lc_updates_per_parent_root.is_empty()); } + + async fn test_reconstruction_immediate_at_deadline(slot_duration_secs: u64) { + let config = BeaconProcessorConfig::default(); + let (ready_work_tx, _) = mpsc::channel::(config.max_scheduled_work_queue_len); + let (_, reprocess_work_rx) = + mpsc::channel::(config.max_scheduled_work_queue_len); + let slot_clock = Arc::new(testing_slot_clock(slot_duration_secs)); + let mut queue = ReprocessQueue::new(ready_work_tx, reprocess_work_rx, slot_clock); + + let slot_duration = queue.slot_clock.slot_duration(); + let reconstruction_deadline_millis = (slot_duration.as_millis() as u64 + * RECONSTRUCTION_DEADLINE.0) + / RECONSTRUCTION_DEADLINE.1; + let reconstruction_deadline = Duration::from_millis(reconstruction_deadline_millis); + + // Advance time to just after the deadline + advance_time( + &queue.slot_clock, + reconstruction_deadline + Duration::from_millis(10), + ) + .await; + + let current_slot = queue.slot_clock.now().unwrap(); + let block_root = Hash256::repeat_byte(0xaa); + + // Queue a reconstruction for the current slot after the deadline + let reconstruction_request = QueuedColumnReconstruction { + block_root, + slot: current_slot, + process_fn: Box::pin(async {}), + }; + queue.handle_message(InboundEvent::Msg( + ReprocessQueueMessage::DelayColumnReconstruction(reconstruction_request), + )); + + assert_eq!(queue.queued_column_reconstructions.len(), 1); + + // Should be immediately ready (0 delay since we're past deadline) + let ready_msg = queue.next().await.unwrap(); + assert!(matches!( + ready_msg, + InboundEvent::ReadyColumnReconstruction(_) + )); + + if let InboundEvent::ReadyColumnReconstruction(reconstruction) = ready_msg { + assert_eq!(reconstruction.block_root, block_root); + queue.handle_message(InboundEvent::ReadyColumnReconstruction(reconstruction)); + } + + assert!(queue.queued_column_reconstructions.is_empty()); + } + + /// Tests that column reconstruction queued after the deadline 
is triggered immediately + /// on mainnet (12s slots). + /// + /// When a reconstruction for the current slot is queued after the reconstruction deadline + /// (1/4 of slot duration = 3s for mainnet), it should be processed immediately with 0 delay. + #[tokio::test] + async fn column_reconstruction_immediate_processing_at_deadline_mainnet() { + tokio::time::pause(); + test_reconstruction_immediate_at_deadline(12).await; + } + + /// Tests that column reconstruction queued after the deadline is triggered immediately + /// on Gnosis (5s slots). + /// + /// When a reconstruction for the current slot is queued after the reconstruction deadline + /// (1/4 of slot duration = 1.25s for Gnosis), it should be processed immediately with 0 delay. + #[tokio::test] + async fn column_reconstruction_immediate_processing_at_deadline_gnosis() { + tokio::time::pause(); + test_reconstruction_immediate_at_deadline(5).await; + } + + /// Tests that column reconstruction uses the standard delay when queued before the deadline. + /// + /// When a reconstruction for the current slot is queued before the deadline, it should wait + /// for the standard QUEUED_RECONSTRUCTION_DELAY (150ms) before being triggered. 
+ #[tokio::test] + async fn column_reconstruction_uses_standard_delay() { + tokio::time::pause(); + + let mut queue = test_queue(); + let current_slot = queue.slot_clock.now().unwrap(); + let block_root = Hash256::repeat_byte(0xcc); + + // Queue a reconstruction at the start of the slot (before deadline) + let reconstruction_request = QueuedColumnReconstruction { + block_root, + slot: current_slot, + process_fn: Box::pin(async {}), + }; + queue.handle_message(InboundEvent::Msg( + ReprocessQueueMessage::DelayColumnReconstruction(reconstruction_request), + )); + + assert_eq!(queue.queued_column_reconstructions.len(), 1); + + // Advance time by QUEUED_RECONSTRUCTION_DELAY + advance_time(&queue.slot_clock, QUEUED_RECONSTRUCTION_DELAY).await; + + // Should be ready after the standard delay + let ready_msg = queue.next().await.unwrap(); + assert!(matches!( + ready_msg, + InboundEvent::ReadyColumnReconstruction(_) + )); + + if let InboundEvent::ReadyColumnReconstruction(reconstruction) = ready_msg { + assert_eq!(reconstruction.block_root, block_root); + } + } } diff --git a/beacon_node/builder_client/Cargo.toml b/beacon_node/builder_client/Cargo.toml index 1920bd0ebb..9b1f86360d 100644 --- a/beacon_node/builder_client/Cargo.toml +++ b/beacon_node/builder_client/Cargo.toml @@ -12,3 +12,7 @@ reqwest = { workspace = true } sensitive_url = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } + +[dev-dependencies] +mockito = { workspace = true } +tokio = { workspace = true } diff --git a/beacon_node/builder_client/src/lib.rs b/beacon_node/builder_client/src/lib.rs index 2c83e34755..6b993542f3 100644 --- a/beacon_node/builder_client/src/lib.rs +++ b/beacon_node/builder_client/src/lib.rs @@ -155,15 +155,7 @@ impl BuilderHttpClient { } ContentType::Json => { self.ssz_available.store(false, Ordering::SeqCst); - let mut de = serde_json::Deserializer::from_slice(&response_bytes); - let data = - T::context_deserialize(&mut de, 
fork_name).map_err(Error::InvalidJson)?; - - Ok(ForkVersionedResponse { - version: fork_name, - metadata: EmptyMetadata {}, - data, - }) + serde_json::from_slice(&response_bytes).map_err(Error::InvalidJson) } } } @@ -546,6 +538,12 @@ impl BuilderHttpClient { #[cfg(test)] mod tests { use super::*; + use eth2::types::builder_bid::{BuilderBid, BuilderBidFulu}; + use eth2::types::test_utils::{SeedableRng, TestRandom, XorShiftRng}; + use eth2::types::{MainnetEthSpec, Signature}; + use mockito::{Matcher, Server, ServerGuard}; + + type E = MainnetEthSpec; #[test] fn test_headers_no_panic() { @@ -556,4 +554,146 @@ mod tests { assert!(HeaderValue::from_str(JSON_ACCEPT_VALUE).is_ok()); assert!(HeaderValue::from_str(JSON_CONTENT_TYPE_HEADER).is_ok()); } + + #[tokio::test] + async fn test_get_builder_header_ssz_response() { + // Set up mock server + let mut server = Server::new_async().await; + let mock_response_body = fulu_signed_builder_bid(); + mock_get_header_response( + &mut server, + Some("fulu"), + ContentType::Ssz, + mock_response_body.clone(), + ); + + let builder_client = BuilderHttpClient::new( + SensitiveUrl::from_str(&server.url()).unwrap(), + None, + None, + false, + ) + .unwrap(); + + let response = builder_client + .get_builder_header( + Slot::new(1), + ExecutionBlockHash::repeat_byte(1), + &PublicKeyBytes::empty(), + ) + .await + .expect("should succeed in get_builder_header") + .expect("should have response body"); + + assert_eq!(response, mock_response_body); + } + + #[tokio::test] + async fn test_get_builder_header_json_response() { + // Set up mock server + let mut server = Server::new_async().await; + let mock_response_body = fulu_signed_builder_bid(); + mock_get_header_response( + &mut server, + None, + ContentType::Json, + mock_response_body.clone(), + ); + + let builder_client = BuilderHttpClient::new( + SensitiveUrl::from_str(&server.url()).unwrap(), + None, + None, + false, + ) + .unwrap(); + + let response = builder_client + .get_builder_header( + 
Slot::new(1), + ExecutionBlockHash::repeat_byte(1), + &PublicKeyBytes::empty(), + ) + .await + .expect("should succeed in get_builder_header") + .expect("should have response body"); + + assert_eq!(response, mock_response_body); + } + + #[tokio::test] + async fn test_get_builder_header_no_version_header_fallback_json() { + // Set up mock server + let mut server = Server::new_async().await; + let mock_response_body = fulu_signed_builder_bid(); + mock_get_header_response( + &mut server, + Some("fulu"), + ContentType::Json, + mock_response_body.clone(), + ); + + let builder_client = BuilderHttpClient::new( + SensitiveUrl::from_str(&server.url()).unwrap(), + None, + None, + false, + ) + .unwrap(); + + let response = builder_client + .get_builder_header( + Slot::new(1), + ExecutionBlockHash::repeat_byte(1), + &PublicKeyBytes::empty(), + ) + .await + .expect("should succeed in get_builder_header") + .expect("should have response body"); + + assert_eq!(response, mock_response_body); + } + + fn mock_get_header_response( + server: &mut ServerGuard, + header_version_opt: Option<&str>, + content_type: ContentType, + response_body: ForkVersionedResponse>, + ) { + let mut mock = server.mock( + "GET", + Matcher::Regex(r"^/eth/v1/builder/header/\d+/.+/.+$".to_string()), + ); + + if let Some(version) = header_version_opt { + mock = mock.with_header(CONSENSUS_VERSION_HEADER, version); + } + + match content_type { + ContentType::Json => { + mock = mock + .with_header(CONTENT_TYPE_HEADER, JSON_CONTENT_TYPE_HEADER) + .with_body(serde_json::to_string(&response_body).unwrap()); + } + ContentType::Ssz => { + mock = mock + .with_header(CONTENT_TYPE_HEADER, SSZ_CONTENT_TYPE_HEADER) + .with_body(response_body.data.as_ssz_bytes()); + } + } + + mock.with_status(200).create(); + } + + fn fulu_signed_builder_bid() -> ForkVersionedResponse> { + let rng = &mut XorShiftRng::from_seed([42; 16]); + ForkVersionedResponse { + version: ForkName::Fulu, + metadata: EmptyMetadata {}, + data: 
SignedBuilderBid { + message: BuilderBid::Fulu(BuilderBidFulu::random_for_test(rng)), + signature: Signature::empty(), + }, + } + } } diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index 02c042bf28..c3c827f0aa 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -202,7 +202,7 @@ where .beacon_graffiti(beacon_graffiti) .event_handler(event_handler) .execution_layer(execution_layer) - .import_all_data_columns(config.network.subscribe_all_data_column_subnets) + .node_custody_type(config.chain.node_custody_type) .validator_monitor_config(config.validator_monitor.clone()) .rng(Box::new( StdRng::try_from_rng(&mut OsRng) diff --git a/beacon_node/client/src/notifier.rs b/beacon_node/client/src/notifier.rs index c83cdad7e0..10d9587ccc 100644 --- a/beacon_node/client/src/notifier.rs +++ b/beacon_node/client/src/notifier.rs @@ -57,6 +57,9 @@ pub fn spawn_notifier( // Store info if we are required to do a backfill sync. let original_oldest_block_slot = beacon_chain.store.get_anchor_info().oldest_block_slot; + // Use this info during custody backfill sync. + let mut original_earliest_data_column_slot = None; + let interval_future = async move { // Perform pre-genesis logging. loop { @@ -80,6 +83,7 @@ pub fn spawn_notifier( // Perform post-genesis logging. let mut last_backfill_log_slot = None; + let mut last_custody_backfill_log_slot = None; loop { // Run the notifier half way through each slot. @@ -112,6 +116,18 @@ pub fn spawn_notifier( let mut speedo = speedo.lock().await; speedo.clear(); } + (_, SyncState::CustodyBackFillSyncing { .. }) => { + // We have transitioned to a custody backfill sync. Reset the speedo. + let mut speedo = speedo.lock().await; + last_custody_backfill_log_slot = None; + speedo.clear(); + } + (SyncState::CustodyBackFillSyncing { .. 
}, _) => { + // We have transitioned from a custody backfill sync, reset the speedo + let mut speedo = speedo.lock().await; + last_custody_backfill_log_slot = None; + speedo.clear(); + } (_, _) => {} } current_sync_state = sync_state; @@ -154,6 +170,38 @@ pub fn spawn_notifier( Instant::now(), ); } + SyncState::CustodyBackFillSyncing { .. } => { + match beacon_chain.store.get_data_column_custody_info() { + Ok(data_column_custody_info) => { + if let Some(earliest_data_column_slot) = data_column_custody_info + .and_then(|info| info.earliest_data_column_slot) + && let Some(da_boundary) = beacon_chain.get_column_da_boundary() + { + sync_distance = earliest_data_column_slot.saturating_sub( + da_boundary.start_slot(T::EthSpec::slots_per_epoch()), + ); + + // We keep track of our starting point for custody backfill sync + // so we can measure our speed of progress. + if original_earliest_data_column_slot.is_none() { + original_earliest_data_column_slot = + Some(earliest_data_column_slot) + } + + if let Some(original_earliest_data_column_slot) = + original_earliest_data_column_slot + { + speedo.observe( + original_earliest_data_column_slot + .saturating_sub(earliest_data_column_slot), + Instant::now(), + ); + } + } + } + Err(e) => error!(error=?e, "Unable to get data column custody info"), + } + } SyncState::SyncingFinalized { .. } | SyncState::SyncingHead { .. } | SyncState::SyncTransition => { @@ -190,6 +238,8 @@ pub fn spawn_notifier( // Log if we are backfilling. let is_backfilling = matches!(current_sync_state, SyncState::BackFillSyncing { .. }); + let is_custody_backfilling = + matches!(current_sync_state, SyncState::CustodyBackFillSyncing { .. 
}); if is_backfilling && last_backfill_log_slot .is_none_or(|slot| slot + BACKFILL_LOG_INTERVAL <= current_slot) @@ -234,6 +284,51 @@ pub fn spawn_notifier( info!("Historical block download complete"); } + if is_custody_backfilling + && last_custody_backfill_log_slot + .is_none_or(|slot| slot + BACKFILL_LOG_INTERVAL <= current_slot) + { + last_custody_backfill_log_slot = Some(current_slot); + + let distance = format!( + "{} slots ({})", + sync_distance.as_u64(), + slot_distance_pretty(sync_distance, slot_duration) + ); + + let speed = speedo.slots_per_second(); + let display_speed = speed.is_some_and(|speed| speed != 0.0); + + if display_speed { + info!( + distance, + speed = sync_speed_pretty(speed), + est_time = + estimated_time_pretty(beacon_chain.get_column_da_boundary().and_then( + |da_boundary| speedo.estimated_time_till_slot( + da_boundary.start_slot(T::EthSpec::slots_per_epoch()) + ) + )), + "Downloading historical data columns" + ); + } else { + info!( + distance, + est_time = + estimated_time_pretty(beacon_chain.get_column_da_boundary().and_then( + |da_boundary| speedo.estimated_time_till_slot( + da_boundary.start_slot(T::EthSpec::slots_per_epoch()) + ) + )), + "Downloading historical data columns" + ); + } + } else if !is_custody_backfilling && last_custody_backfill_log_slot.is_some() { + last_custody_backfill_log_slot = None; + original_earliest_data_column_slot = None; + info!("Historical data column download complete"); + } + // Log if we are syncing if current_sync_state.is_syncing() { metrics::set_gauge(&metrics::IS_SYNCED, 0); diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index 7f6c97a0f8..f6d8dbc157 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -478,7 +478,9 @@ pub fn serve( ))) } } - SyncState::SyncTransition | SyncState::BackFillSyncing { .. } => Ok(()), + SyncState::SyncTransition + | SyncState::BackFillSyncing { .. } + | SyncState::CustodyBackFillSyncing { .. 
} => Ok(()), SyncState::Synced => Ok(()), SyncState::Stalled => Ok(()), } @@ -1236,8 +1238,8 @@ pub fn serve( |state_id: StateId, task_spawner: TaskSpawner, chain: Arc>| { - task_spawner.blocking_json_task(Priority::P1, move || { - let (data, execution_optimistic, finalized) = state_id + task_spawner.blocking_response_task(Priority::P1, move || { + let (data, execution_optimistic, finalized, fork_name) = state_id .map_state_and_execution_optimistic_and_finalized( &chain, |state, execution_optimistic, finalized| { @@ -1247,15 +1249,23 @@ pub fn serve( )); }; - Ok((consolidations.clone(), execution_optimistic, finalized)) + Ok(( + consolidations.clone(), + execution_optimistic, + finalized, + state.fork_name_unchecked(), + )) }, )?; - Ok(api_types::ExecutionOptimisticFinalizedResponse { + execution_optimistic_finalized_beacon_response( + ResponseIncludesVersion::Yes(fork_name), + execution_optimistic, + finalized, data, - execution_optimistic: Some(execution_optimistic), - finalized: Some(finalized), - }) + ) + .map(|res| warp::reply::json(&res).into_response()) + .map(|resp| add_consensus_version_header(resp, fork_name)) }) }, ); diff --git a/beacon_node/http_api/src/proposer_duties.rs b/beacon_node/http_api/src/proposer_duties.rs index ceac60cbad..78f99c475c 100644 --- a/beacon_node/http_api/src/proposer_duties.rs +++ b/beacon_node/http_api/src/proposer_duties.rs @@ -103,14 +103,6 @@ fn try_proposer_duties_from_cache( let head_block = &head.snapshot.beacon_block; let head_block_root = head.head_block_root(); let head_epoch = head_block.slot().epoch(T::EthSpec::slots_per_epoch()); - let head_decision_root = head - .snapshot - .beacon_state - .proposer_shuffling_decision_root(head_block_root, &chain.spec) - .map_err(warp_utils::reject::beacon_state_error)?; - let execution_optimistic = chain - .is_optimistic_or_invalid_head_block(head_block) - .map_err(warp_utils::reject::unhandled_error)?; // This code path can't handle requests for past epochs. 
if head_epoch > request_epoch { @@ -119,6 +111,15 @@ fn try_proposer_duties_from_cache( ))); } + let head_decision_root = head + .snapshot + .beacon_state + .proposer_shuffling_decision_root_at_epoch(request_epoch, head_block_root, &chain.spec) + .map_err(warp_utils::reject::beacon_state_error)?; + let execution_optimistic = chain + .is_optimistic_or_invalid_head_block(head_block) + .map_err(warp_utils::reject::unhandled_error)?; + chain .beacon_proposer_cache .lock() diff --git a/beacon_node/http_api/tests/interactive_tests.rs b/beacon_node/http_api/tests/interactive_tests.rs index cca17d238f..03db70e659 100644 --- a/beacon_node/http_api/tests/interactive_tests.rs +++ b/beacon_node/http_api/tests/interactive_tests.rs @@ -946,3 +946,110 @@ async fn queue_attestations_from_http() { attestation_future.await.unwrap(); } + +// Test that a request for next epoch proposer duties suceeds when the current slot clock is within +// gossip clock disparity (500ms) of the new epoch. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn proposer_duties_with_gossip_tolerance() { + let validator_count = 24; + + let tester = InteractiveTester::::new(None, validator_count).await; + let harness = &tester.harness; + let spec = &harness.spec; + let client = &tester.client; + + let num_initial = 4 * E::slots_per_epoch() - 1; + let next_epoch_start_slot = Slot::new(num_initial + 1); + + harness.advance_slot(); + harness + .extend_chain_with_sync( + num_initial as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + SyncCommitteeStrategy::NoValidators, + LightClientStrategy::Disabled, + ) + .await; + + assert_eq!(harness.chain.slot().unwrap(), num_initial); + + // Set the clock to just before the next epoch. 
+ harness.chain.slot_clock.advance_time( + Duration::from_secs(spec.seconds_per_slot) - spec.maximum_gossip_clock_disparity(), + ); + assert_eq!( + harness + .chain + .slot_clock + .now_with_future_tolerance(spec.maximum_gossip_clock_disparity()) + .unwrap(), + next_epoch_start_slot + ); + + let head_state = harness.get_current_state(); + let head_block_root = harness.head_block_root(); + let tolerant_current_epoch = next_epoch_start_slot.epoch(E::slots_per_epoch()); + + // This is a regression test for the bug described here: + // https://github.com/sigp/lighthouse/pull/8130/files#r2386594566 + // + // To trigger it, we need to prime the proposer shuffling cache with an incorrect entry which + // the previous code would be liable to lookup due to the bugs in its decision root calculation. + let wrong_decision_root = head_state + .proposer_shuffling_decision_root(head_block_root, spec) + .unwrap(); + let wrong_proposer_indices = vec![0; E::slots_per_epoch() as usize]; + harness + .chain + .beacon_proposer_cache + .lock() + .insert( + tolerant_current_epoch, + wrong_decision_root, + wrong_proposer_indices.clone(), + head_state.fork(), + ) + .unwrap(); + + // Request the proposer duties. + let proposer_duties_tolerant_current_epoch = client + .get_validator_duties_proposer(tolerant_current_epoch) + .await + .unwrap(); + + assert_eq!( + proposer_duties_tolerant_current_epoch.dependent_root, + head_state + .proposer_shuffling_decision_root_at_epoch( + tolerant_current_epoch, + head_block_root, + spec + ) + .unwrap() + ); + assert_ne!( + proposer_duties_tolerant_current_epoch + .data + .iter() + .map(|data| data.validator_index as usize) + .collect::>(), + wrong_proposer_indices, + ); + + // We should get the exact same result after properly advancing into the epoch. 
+ harness + .chain + .slot_clock + .advance_time(spec.maximum_gossip_clock_disparity()); + assert_eq!(harness.chain.slot().unwrap(), next_epoch_start_slot); + let proposer_duties_current_epoch = client + .get_validator_duties_proposer(tolerant_current_epoch) + .await + .unwrap(); + + assert_eq!( + proposer_duties_tolerant_current_epoch, + proposer_duties_current_epoch + ); +} diff --git a/beacon_node/http_api/tests/tests.rs b/beacon_node/http_api/tests/tests.rs index 9c18a7c1e8..dc2fd4ae44 100644 --- a/beacon_node/http_api/tests/tests.rs +++ b/beacon_node/http_api/tests/tests.rs @@ -1,3 +1,4 @@ +use beacon_chain::custody_context::NodeCustodyType; use beacon_chain::test_utils::RelativeSyncCommittee; use beacon_chain::{ BeaconChain, ChainConfig, StateSkipConfig, WhenSlotSkipped, @@ -90,7 +91,7 @@ struct ApiTester { struct ApiTesterConfig { spec: ChainSpec, retain_historic_states: bool, - import_all_data_columns: bool, + node_custody_type: NodeCustodyType, } impl Default for ApiTesterConfig { @@ -100,7 +101,7 @@ impl Default for ApiTesterConfig { Self { spec, retain_historic_states: false, - import_all_data_columns: false, + node_custody_type: NodeCustodyType::Fullnode, } } } @@ -139,7 +140,7 @@ impl ApiTester { .deterministic_withdrawal_keypairs(VALIDATOR_COUNT) .fresh_ephemeral_store() .mock_execution_layer() - .import_all_data_columns(config.import_all_data_columns) + .node_custody_type(config.node_custody_type) .build(); harness @@ -1369,12 +1370,14 @@ impl ApiTester { .ok() .map(|(state, _execution_optimistic, _finalized)| state); - let result = self + let result = match self .client .get_beacon_states_pending_consolidations(state_id.0) .await - .unwrap() - .map(|res| res.data); + { + Ok(response) => response, + Err(e) => panic!("query failed incorrectly: {e:?}"), + }; if result.is_none() && state_opt.is_none() { continue; @@ -1383,7 +1386,12 @@ impl ApiTester { let state = state_opt.as_mut().expect("result should be none"); let expected = 
state.pending_consolidations().unwrap(); - assert_eq!(result.unwrap(), expected.to_vec()); + let response = result.unwrap(); + assert_eq!(response.data(), &expected.to_vec()); + + // Check that the version header is returned in the response + let fork_name = state.fork_name(&self.chain.spec).unwrap(); + assert_eq!(response.version(), Some(fork_name),); } self @@ -7835,8 +7843,7 @@ async fn get_blobs_post_fulu_supernode() { let mut config = ApiTesterConfig { retain_historic_states: false, spec: E::default_spec(), - // For supernode, we import all data columns - import_all_data_columns: true, + node_custody_type: NodeCustodyType::Supernode, }; config.spec.altair_fork_epoch = Some(Epoch::new(0)); config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 0ccad8d042..87337cafcf 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -247,23 +247,16 @@ impl PeerDB { .map(|(peer_id, _)| peer_id) } - /// Returns all the synced peers from the list of allowed peers that claim to have the block + /// Returns all the synced peers from the peer db that claim to have the block /// components for the given epoch based on `status.earliest_available_slot`. /// /// If `earliest_available_slot` info is not available, then return peer anyway assuming it has the /// required data. - /// - /// If `allowed_peers` is `Some`, then filters for the epoch only for those peers. 
- pub fn synced_peers_for_epoch<'a>( - &'a self, - epoch: Epoch, - allowed_peers: Option<&'a HashSet>, - ) -> impl Iterator { + pub fn synced_peers_for_epoch(&self, epoch: Epoch) -> impl Iterator { self.peers .iter() - .filter(move |(peer_id, info)| { - allowed_peers.is_none_or(|allowed| allowed.contains(peer_id)) - && info.is_connected() + .filter(move |(_, info)| { + info.is_connected() && match info.sync_status() { SyncStatus::Synced { info } => { info.has_slot(epoch.end_slot(E::slots_per_epoch())) diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 0f5fd99c27..f1a4d87de7 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -60,8 +60,8 @@ pub struct BlobsByRangeRequestId { pub struct DataColumnsByRangeRequestId { /// Id to identify this attempt at a data_columns_by_range request for `parent_request_id` pub id: Id, - /// The Id of the overall By Range request for block components. - pub parent_request_id: ComponentsByRangeRequestId, + /// The Id of the overall By Range request for either a components by range request or a custody backfill request. + pub parent_request_id: DataColumnsByRangeRequester, /// The peer id associated with the request. /// /// This is useful to penalize the peer at a later point if it returned data columns that @@ -69,6 +69,12 @@ pub struct DataColumnsByRangeRequestId { pub peer: PeerId, } +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub enum DataColumnsByRangeRequester { + ComponentsByRange(ComponentsByRangeRequestId), + CustodyBackfillSync(CustodyBackFillBatchRequestId), +} + /// Block components by range request for range sync. Includes an ID for downstream consumers to /// handle retries and tie all their sub requests together. 
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -80,6 +86,24 @@ pub struct ComponentsByRangeRequestId { pub requester: RangeRequestId, } +/// A batch of data columns by range request for custody sync. Includes an ID for downstream consumers to +/// handle retries and tie all the range requests for the given epoch together. +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct CustodyBackFillBatchRequestId { + /// For each `epoch` we may request the same data in a later retry. This Id identifies the + /// current attempt. + pub id: Id, + pub batch_id: CustodyBackfillBatchId, +} + +/// Custody backfill may be restarted and sync each epoch multiple times in different runs. Identify +/// each batch by epoch and run_id for uniqueness. +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct CustodyBackfillBatchId { + pub epoch: Epoch, + pub run_id: u64, +} + /// Range sync chain or backfill batch #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum RangeRequestId { @@ -217,6 +241,8 @@ impl_display!(ComponentsByRangeRequestId, "{}/{}", id, requester); impl_display!(DataColumnsByRootRequestId, "{}/{}", id, requester); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); impl_display!(CustodyId, "{}", requester); +impl_display!(CustodyBackFillBatchRequestId, "{}/{}", id, batch_id); +impl_display!(CustodyBackfillBatchId, "{}/{}", epoch, run_id); impl Display for DataColumnsByRootRequester { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { @@ -241,6 +267,15 @@ impl Display for RangeRequestId { } } +impl Display for DataColumnsByRangeRequester { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::ComponentsByRange(id) => write!(f, "ByRange/{id}"), + Self::CustodyBackfillSync(id) => write!(f, "CustodyBackfill/{id}"), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -263,15 +298,17 @@ mod tests { fn display_id_data_columns_by_range() { let id = DataColumnsByRangeRequestId { id: 123, - 
parent_request_id: ComponentsByRangeRequestId { - id: 122, - requester: RangeRequestId::RangeSync { - chain_id: 54, - batch_id: Epoch::new(0), + parent_request_id: DataColumnsByRangeRequester::ComponentsByRange( + ComponentsByRangeRequestId { + id: 122, + requester: RangeRequestId::RangeSync { + chain_id: 54, + batch_id: Epoch::new(0), + }, }, - }, + ), peer: PeerId::random(), }; - assert_eq!(format!("{id}"), "123/122/RangeSync/0/54"); + assert_eq!(format!("{id}"), "123/ByRange/122/RangeSync/0/54"); } } diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index b8c34f8392..2a3571c3b7 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -4,6 +4,7 @@ use crate::peer_manager::peerdb::PeerDB; use crate::rpc::{MetaData, MetaDataV3}; use crate::types::{BackFillState, SyncState}; use crate::{Client, Enr, GossipTopic, Multiaddr, NetworkConfig, PeerId}; +use eth2::lighthouse::sync_state::CustodyBackFillState; use network_utils::enr_ext::EnrExt; use parking_lot::RwLock; use std::collections::HashSet; @@ -29,6 +30,8 @@ pub struct NetworkGlobals { pub sync_state: RwLock, /// The current state of the backfill sync. pub backfill_state: RwLock, + /// The current state of custody sync. + pub custody_sync_state: RwLock, /// The computed sampling subnets and columns is stored to avoid re-computing. pub sampling_subnets: RwLock>, /// Network-related configuration. Immutable after initialization. 
@@ -91,6 +94,9 @@ impl NetworkGlobals { gossipsub_subscriptions: RwLock::new(HashSet::new()), sync_state: RwLock::new(SyncState::Stalled), backfill_state: RwLock::new(BackFillState::Paused), + custody_sync_state: RwLock::new(CustodyBackFillState::Pending( + "Custody backfill sync initialized".to_string(), + )), sampling_subnets: RwLock::new(sampling_subnets), config, spec, diff --git a/beacon_node/lighthouse_network/src/types/mod.rs b/beacon_node/lighthouse_network/src/types/mod.rs index 0bbbcebaf2..3f57406fc7 100644 --- a/beacon_node/lighthouse_network/src/types/mod.rs +++ b/beacon_node/lighthouse_network/src/types/mod.rs @@ -10,7 +10,7 @@ pub type EnrSyncCommitteeBitfield = BitVector<::SyncCommitteeSu pub type Enr = discv5::enr::Enr; -pub use eth2::lighthouse::sync_state::{BackFillState, SyncState}; +pub use eth2::lighthouse::sync_state::{BackFillState, CustodyBackFillState, SyncState}; pub use globals::NetworkGlobals; pub use pubsub::{PubsubMessage, SnappyTransform}; pub use subnet::{Subnet, SubnetDiscovery}; diff --git a/beacon_node/lighthouse_tracing/src/lib.rs b/beacon_node/lighthouse_tracing/src/lib.rs index 18a9874252..56dccadaa9 100644 --- a/beacon_node/lighthouse_tracing/src/lib.rs +++ b/beacon_node/lighthouse_tracing/src/lib.rs @@ -26,7 +26,9 @@ pub const SPAN_PROCESS_RPC_BLOCK: &str = "process_rpc_block"; pub const SPAN_PROCESS_RPC_BLOBS: &str = "process_rpc_blobs"; pub const SPAN_PROCESS_RPC_CUSTODY_COLUMNS: &str = "process_rpc_custody_columns"; pub const SPAN_PROCESS_CHAIN_SEGMENT: &str = "process_chain_segment"; +pub const SPAN_CUSTODY_BACKFILL_SYNC_BATCH_REQUEST: &str = "custody_backfill_sync_batch_request"; pub const SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL: &str = "process_chain_segment_backfill"; +pub const SPAN_CUSTODY_BACKFILL_SYNC_IMPORT_COLUMNS: &str = "custody_backfill_sync_import_columns"; /// Fork choice root spans pub const SPAN_RECOMPUTE_HEAD: &str = "recompute_head_at_slot"; @@ -73,4 +75,6 @@ pub const LH_BN_ROOT_SPAN_NAMES: &[&str] = &[ 
SPAN_HANDLE_LIGHT_CLIENT_BOOTSTRAP, SPAN_HANDLE_LIGHT_CLIENT_OPTIMISTIC_UPDATE, SPAN_HANDLE_LIGHT_CLIENT_FINALITY_UPDATE, + SPAN_CUSTODY_BACKFILL_SYNC_BATCH_REQUEST, + SPAN_CUSTODY_BACKFILL_SYNC_IMPORT_COLUMNS, ]; diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index a2b5af8b08..cea06a28c8 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -212,6 +212,22 @@ pub static BEACON_PROCESSOR_RPC_BLOCK_IMPORTED_TOTAL: LazyLock, +> = LazyLock::new(|| { + try_create_int_counter( + "beacon_processor_custody_backfill_column_import_success_total", + "Total number of custody backfill sync columns successfully processed.", + ) +}); +pub static BEACON_PROCESSOR_CUSTODY_BACKFILL_BATCH_FAILED_TOTAL: LazyLock> = + LazyLock::new(|| { + try_create_int_counter( + "beacon_processor_custody_backfill_batch_failed_total", + "Total number of custody backfill batches that failed to be processed.", + ) + }); // Chain segments. pub static BEACON_PROCESSOR_CHAIN_SEGMENT_SUCCESS_TOTAL: LazyLock> = LazyLock::new(|| { diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 85ccde1d59..7441e92871 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -16,6 +16,7 @@ use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, BlobsByRootRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, LightClientUpdatesByRangeRequest, }; +use lighthouse_network::service::api_types::CustodyBackfillBatchId; use lighthouse_network::{ Client, MessageId, NetworkGlobals, PeerId, PubsubMessage, rpc::{BlocksByRangeRequest, BlocksByRootRequest, LightClientBootstrapRequest, StatusMessage}, @@ -492,6 +493,22 @@ impl NetworkBeaconProcessor { }) } + pub fn send_historic_data_columns( + self: &Arc, + batch_id: CustodyBackfillBatchId, + data_columns: DataColumnSidecarList, + ) -> 
Result<(), Error> { + let processor = self.clone(); + let process_fn = move || processor.process_historic_data_columns(batch_id, data_columns); + + let work = Work::ChainSegmentBackfill(Box::new(process_fn)); + + self.try_send(BeaconWorkEvent { + drop_during_sync: true, + work, + }) + } + /// Create a new work event to import `blocks` as a beacon chain segment. pub fn send_chain_segment( self: &Arc, diff --git a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs index 58e02ffe00..a81595322b 100644 --- a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs @@ -3,7 +3,7 @@ use crate::network_beacon_processor::{FUTURE_SLOT_TOLERANCE, NetworkBeaconProces use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::SyncMessage; -use beacon_chain::{BeaconChainError, BeaconChainTypes, WhenSlotSkipped}; +use beacon_chain::{BeaconChainError, BeaconChainTypes, BlockProcessStatus, WhenSlotSkipped}; use itertools::{Itertools, process_results}; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, BlobsByRootRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, @@ -293,21 +293,49 @@ impl NetworkBeaconProcessor { inbound_request_id: InboundRequestId, request: BlobsByRootRequest, ) -> Result<(), (RpcErrorResponse, &'static str)> { - let Some(requested_root) = request.blob_ids.as_slice().first().map(|id| id.block_root) - else { - // No blob ids requested. 
- return Ok(()); - }; - let requested_indices = request - .blob_ids - .as_slice() - .iter() - .map(|id| id.index) - .collect::>(); let mut send_blob_count = 0; + let fulu_start_slot = self + .chain + .spec + .fulu_fork_epoch + .map(|epoch| epoch.start_slot(T::EthSpec::slots_per_epoch())); + let mut blob_list_results = HashMap::new(); + + let slots_by_block_root: HashMap = request + .blob_ids + .iter() + .flat_map(|blob_id| { + let block_root = blob_id.block_root; + self.chain + .data_availability_checker + .get_cached_block(&block_root) + .and_then(|status| match status { + BlockProcessStatus::NotValidated(block, _source) => Some(block), + BlockProcessStatus::ExecutionValidated(block) => Some(block), + BlockProcessStatus::Unknown => None, + }) + .or_else(|| self.chain.early_attester_cache.get_block(block_root)) + .map(|block| (block_root, block.slot())) + }) + .collect(); + for id in request.blob_ids.as_slice() { + let BlobIdentifier { + block_root: root, + index, + } = id; + + let slot = slots_by_block_root.get(root); + + // Skip if slot is >= fulu_start_slot + if let (Some(slot), Some(fulu_slot)) = (slot, fulu_start_slot) + && *slot >= fulu_slot + { + continue; + } + // First attempt to get the blobs from the RPC cache. 
if let Ok(Some(blob)) = self.chain.data_availability_checker.get_blob(id) { self.send_response( @@ -317,11 +345,6 @@ impl NetworkBeaconProcessor { ); send_blob_count += 1; } else { - let BlobIdentifier { - block_root: root, - index, - } = id; - let blob_list_result = match blob_list_results.entry(root) { Entry::Vacant(entry) => { entry.insert(self.chain.get_blobs_checking_early_attester_cache(root)) @@ -331,16 +354,15 @@ impl NetworkBeaconProcessor { match blob_list_result.as_ref() { Ok(blobs_sidecar_list) => { - 'inner: for blob_sidecar in blobs_sidecar_list.iter() { - if blob_sidecar.index == *index { - self.send_response( - peer_id, - inbound_request_id, - Response::BlobsByRoot(Some(blob_sidecar.clone())), - ); - send_blob_count += 1; - break 'inner; - } + if let Some(blob_sidecar) = + blobs_sidecar_list.iter().find(|b| b.index == *index) + { + self.send_response( + peer_id, + inbound_request_id, + Response::BlobsByRoot(Some(blob_sidecar.clone())), + ); + send_blob_count += 1; } } Err(e) => { @@ -354,10 +376,10 @@ impl NetworkBeaconProcessor { } } } + debug!( %peer_id, - %requested_root, - ?requested_indices, + block_root = ?slots_by_block_root.keys(), returned = send_blob_count, "BlobsByRoot outgoing response processed" ); @@ -1003,6 +1025,34 @@ impl NetworkBeaconProcessor { ); let request_start_slot = Slot::from(req.start_slot); + let request_start_epoch = request_start_slot.epoch(T::EthSpec::slots_per_epoch()); + let fork_name = self.chain.spec.fork_name_at_epoch(request_start_epoch); + // Should not send more than max request blob sidecars + if req.max_blobs_requested(request_start_epoch, &self.chain.spec) + > self.chain.spec.max_request_blob_sidecars(fork_name) as u64 + { + return Err(( + RpcErrorResponse::InvalidRequest, + "Request exceeded `MAX_REQUEST_BLOBS_SIDECARS`", + )); + } + + let effective_count = if let Some(fulu_epoch) = self.chain.spec.fulu_fork_epoch { + let fulu_start_slot = fulu_epoch.start_slot(T::EthSpec::slots_per_epoch()); + let 
request_end_slot = request_start_slot.saturating_add(req.count) - 1; + + // If the request_start_slot is at or after a Fulu slot, return an empty response + if request_start_slot >= fulu_start_slot { + return Ok(()); + // For the case that the request slots spans across the Fulu fork slot + } else if request_end_slot >= fulu_start_slot { + (fulu_start_slot - request_start_slot).as_u64() + } else { + req.count + } + } else { + req.count + }; let data_availability_boundary_slot = match self.chain.data_availability_boundary() { Some(boundary) => boundary.start_slot(T::EthSpec::slots_per_epoch()), @@ -1040,7 +1090,7 @@ impl NetworkBeaconProcessor { } let block_roots = - self.get_block_roots_for_slot_range(req.start_slot, req.count, "BlobsByRange")?; + self.get_block_roots_for_slot_range(req.start_slot, effective_count, "BlobsByRange")?; let current_slot = self .chain @@ -1067,7 +1117,7 @@ impl NetworkBeaconProcessor { // Due to skip slots, blobs could be out of the range, we ensure they // are in the range before sending if blob_sidecar.slot() >= request_start_slot - && blob_sidecar.slot() < request_start_slot + req.count + && blob_sidecar.slot() < request_start_slot + effective_count { blobs_sent += 1; self.send_network_message(NetworkMessage::SendResponse { @@ -1148,39 +1198,48 @@ impl NetworkBeaconProcessor { if req.max_requested::() > self.chain.spec.max_request_data_column_sidecars { return Err(( RpcErrorResponse::InvalidRequest, - "Request exceeded `MAX_REQUEST_BLOBS_SIDECARS`", + "Request exceeded `MAX_REQUEST_DATA_COLUMN_SIDECARS`", )); } let request_start_slot = Slot::from(req.start_slot); - let data_availability_boundary_slot = match self.chain.data_availability_boundary() { - Some(boundary) => boundary.start_slot(T::EthSpec::slots_per_epoch()), - None => { - debug!("Deneb fork is disabled"); - return Err((RpcErrorResponse::InvalidRequest, "Deneb fork is disabled")); - } - }; + let column_data_availability_boundary_slot = + match 
self.chain.column_data_availability_boundary() { + Some(boundary) => boundary.start_slot(T::EthSpec::slots_per_epoch()), + None => { + debug!("Fulu fork is disabled"); + return Err((RpcErrorResponse::InvalidRequest, "Fulu fork is disabled")); + } + }; - let oldest_data_column_slot = self - .chain - .store - .get_data_column_info() - .oldest_data_column_slot - .unwrap_or(data_availability_boundary_slot); + let earliest_custodied_data_column_slot = + match self.chain.earliest_custodied_data_column_epoch() { + Some(earliest_custodied_epoch) => { + let earliest_custodied_slot = + earliest_custodied_epoch.start_slot(T::EthSpec::slots_per_epoch()); + // Ensure the earliest columns we serve are within the data availability window + if earliest_custodied_slot < column_data_availability_boundary_slot { + column_data_availability_boundary_slot + } else { + earliest_custodied_slot + } + } + None => column_data_availability_boundary_slot, + }; - if request_start_slot < oldest_data_column_slot { + if request_start_slot < earliest_custodied_data_column_slot { debug!( %request_start_slot, - %oldest_data_column_slot, - %data_availability_boundary_slot, - "Range request start slot is older than data availability boundary." + %earliest_custodied_data_column_slot, + %column_data_availability_boundary_slot, + "Range request start slot is older than the earliest custodied data column slot." 
); - return if data_availability_boundary_slot < oldest_data_column_slot { + return if earliest_custodied_data_column_slot > column_data_availability_boundary_slot { Err(( RpcErrorResponse::ResourceUnavailable, - "blobs pruned within boundary", + "columns pruned within boundary", )) } else { Err(( diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 1d99540c29..41b12fa01b 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -1,6 +1,7 @@ use crate::metrics::{self, register_process_result_metrics}; use crate::network_beacon_processor::{FUTURE_SLOT_TOLERANCE, NetworkBeaconProcessor}; use crate::sync::BatchProcessResult; +use crate::sync::manager::CustodyBatchProcessResult; use crate::sync::{ ChainId, manager::{BlockProcessType, SyncMessage}, @@ -8,6 +9,7 @@ use crate::sync::{ use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; use beacon_chain::data_availability_checker::AvailabilityCheckError; use beacon_chain::data_availability_checker::MaybeAvailableBlock; +use beacon_chain::historical_data_columns::HistoricalDataColumnError; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer, validator_monitor::get_slot_delay_ms, @@ -18,15 +20,17 @@ use beacon_processor::{ }; use beacon_processor::{Work, WorkEvent}; use lighthouse_network::PeerAction; +use lighthouse_network::service::api_types::CustodyBackfillBatchId; use lighthouse_tracing::{ - SPAN_PROCESS_CHAIN_SEGMENT, SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, SPAN_PROCESS_RPC_BLOBS, - SPAN_PROCESS_RPC_BLOCK, SPAN_PROCESS_RPC_CUSTODY_COLUMNS, + SPAN_CUSTODY_BACKFILL_SYNC_IMPORT_COLUMNS, SPAN_PROCESS_CHAIN_SEGMENT, + SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, SPAN_PROCESS_RPC_BLOBS, SPAN_PROCESS_RPC_BLOCK, + SPAN_PROCESS_RPC_CUSTODY_COLUMNS, 
}; use logging::crit; use std::sync::Arc; use std::time::Duration; use store::KzgCommitment; -use tracing::{debug, error, info, instrument, warn}; +use tracing::{debug, debug_span, error, info, instrument, warn}; use types::beacon_block_body::format_kzg_commitments; use types::blob_sidecar::FixedBlobSidecarList; use types::{BlockImportSource, DataColumnSidecarList, Epoch, Hash256}; @@ -418,6 +422,103 @@ impl NetworkBeaconProcessor { }); } + pub fn process_historic_data_columns( + &self, + batch_id: CustodyBackfillBatchId, + downloaded_columns: DataColumnSidecarList, + ) { + let _guard = debug_span!( + SPAN_CUSTODY_BACKFILL_SYNC_IMPORT_COLUMNS, + epoch = %batch_id.epoch, + columns_received_count = downloaded_columns.len() + ) + .entered(); + + let sent_columns = downloaded_columns.len(); + let result = match self + .chain + .import_historical_data_column_batch(batch_id.epoch, downloaded_columns) + { + Ok(imported_columns) => { + metrics::inc_counter_by( + &metrics::BEACON_PROCESSOR_CUSTODY_BACKFILL_COLUMN_IMPORT_SUCCESS_TOTAL, + imported_columns as u64, + ); + CustodyBatchProcessResult::Success { + sent_columns, + imported_columns, + } + } + Err(e) => { + metrics::inc_counter( + &metrics::BEACON_PROCESSOR_CUSTODY_BACKFILL_BATCH_FAILED_TOTAL, + ); + let peer_action: Option = match &e { + HistoricalDataColumnError::NoBlockFound { + data_column_block_root, + expected_block_root, + } => { + debug!( + error = "no_block_found", + ?data_column_block_root, + ?expected_block_root, + "Custody backfill batch processing error" + ); + // The peer is faulty if they send blocks with bad roots. + Some(PeerAction::LowToleranceError) + } + HistoricalDataColumnError::MissingDataColumns { .. } => { + warn!( + error = ?e, + "Custody backfill batch processing error", + ); + // The peer is faulty if they don't return data columns + // that they advertised as available. 
+ Some(PeerAction::LowToleranceError) + } + HistoricalDataColumnError::InvalidKzg => { + warn!( + error = ?e, + "Custody backfill batch processing error", + ); + // The peer is faulty if they don't return data columns + // with valid kzg commitments. + Some(PeerAction::LowToleranceError) + } + HistoricalDataColumnError::BeaconChainError(e) => { + match &**e { + beacon_chain::BeaconChainError::FailedColumnCustodyInfoUpdate => {} + _ => { + warn!( + error = ?e, + "Custody backfill batch processing error", + ); + } + } + + // This is an interal error, don't penalize the peer + None + } + HistoricalDataColumnError::IndexOutOfBounds => { + error!( + error = ?e, + "Custody backfill batch out of bounds error" + ); + // This should never occur, don't penalize the peer. + None + } + HistoricalDataColumnError::StoreError(e) => { + warn!(error = ?e, "Custody backfill batch processing error"); + // This is an internal error, don't penalize the peer. + None + } + }; + CustodyBatchProcessResult::Error { peer_action } + } + }; + self.send_sync_message(SyncMessage::CustodyBatchProcessed { result, batch_id }); + } + /// Attempt to import the chain segment (`blocks`) to the beacon chain, informing the sync /// thread if more blocks are needed to process it. 
#[instrument( diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index 4137c974bf..a9794cb5c4 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -9,6 +9,7 @@ use crate::{ sync::{SyncMessage, manager::BlockProcessType}, }; use beacon_chain::block_verification_types::RpcBlock; +use beacon_chain::custody_context::NodeCustodyType; use beacon_chain::data_column_verification::validate_data_column_sidecar_for_gossip; use beacon_chain::kzg_utils::blobs_to_data_column_sidecars; use beacon_chain::observed_data_sidecars::DoNotObserve; @@ -22,7 +23,7 @@ use gossipsub::MessageAcceptance; use itertools::Itertools; use lighthouse_network::rpc::InboundRequestId; use lighthouse_network::rpc::methods::{ - BlobsByRangeRequest, DataColumnsByRangeRequest, MetaDataV3, + BlobsByRangeRequest, BlobsByRootRequest, DataColumnsByRangeRequest, MetaDataV3, }; use lighthouse_network::{ Client, MessageId, NetworkConfig, NetworkGlobals, PeerId, Response, @@ -37,12 +38,12 @@ use std::iter::Iterator; use std::sync::Arc; use std::time::Duration; use tokio::sync::mpsc; -use types::blob_sidecar::FixedBlobSidecarList; +use types::blob_sidecar::{BlobIdentifier, FixedBlobSidecarList}; use types::{ AttesterSlashing, BlobSidecar, BlobSidecarList, ChainSpec, DataColumnSidecarList, DataColumnSubnetId, Epoch, EthSpec, Hash256, MainnetEthSpec, ProposerSlashing, - SignedAggregateAndProof, SignedBeaconBlock, SignedVoluntaryExit, SingleAttestation, Slot, - SubnetId, + RuntimeVariableList, SignedAggregateAndProof, SignedBeaconBlock, SignedVoluntaryExit, + SingleAttestation, Slot, SubnetId, }; type E = MainnetEthSpec; @@ -94,20 +95,32 @@ impl TestRig { // This allows for testing voluntary exits without building out a massive chain. 
let mut spec = test_spec::(); spec.shard_committee_period = 2; - Self::new_parametric(chain_length, BeaconProcessorConfig::default(), false, spec).await + Self::new_parametric( + chain_length, + BeaconProcessorConfig::default(), + NodeCustodyType::Fullnode, + spec, + ) + .await } pub async fn new_supernode(chain_length: u64) -> Self { // This allows for testing voluntary exits without building out a massive chain. let mut spec = test_spec::(); spec.shard_committee_period = 2; - Self::new_parametric(chain_length, BeaconProcessorConfig::default(), true, spec).await + Self::new_parametric( + chain_length, + BeaconProcessorConfig::default(), + NodeCustodyType::Supernode, + spec, + ) + .await } pub async fn new_parametric( chain_length: u64, beacon_processor_config: BeaconProcessorConfig, - import_data_columns: bool, + node_custody_type: NodeCustodyType, spec: ChainSpec, ) -> Self { let spec = Arc::new(spec); @@ -116,7 +129,7 @@ impl TestRig { .deterministic_keypairs(VALIDATOR_COUNT) .fresh_ephemeral_store() .mock_execution_layer() - .import_all_data_columns(import_data_columns) + .node_custody_type(node_custody_type) .chain_config(<_>::default()) .build(); @@ -431,15 +444,22 @@ impl TestRig { } } - pub fn enqueue_blobs_by_range_request(&self, count: u64) { + pub fn enqueue_blobs_by_range_request(&self, start_slot: u64, count: u64) { self.network_beacon_processor .send_blobs_by_range_request( PeerId::random(), InboundRequestId::new_unchecked(42, 24), - BlobsByRangeRequest { - start_slot: 0, - count, - }, + BlobsByRangeRequest { start_slot, count }, + ) + .unwrap(); + } + + pub fn enqueue_blobs_by_root_request(&self, blob_ids: RuntimeVariableList) { + self.network_beacon_processor + .send_blobs_by_roots_request( + PeerId::random(), + InboundRequestId::new_unchecked(42, 24), + BlobsByRootRequest { blob_ids }, ) .unwrap(); } @@ -1603,7 +1623,7 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() { let mut rig = TestRig::new_parametric( SMALL_CHAIN, 
beacon_processor_config, - false, + NodeCustodyType::Fullnode, test_spec::(), ) .await; @@ -1632,8 +1652,9 @@ async fn test_blobs_by_range() { return; }; let mut rig = TestRig::new(64).await; + let start_slot = 0; let slot_count = 32; - rig.enqueue_blobs_by_range_request(slot_count); + rig.enqueue_blobs_by_range_request(start_slot, slot_count); let mut blob_count = 0; for slot in 0..slot_count { @@ -1651,6 +1672,71 @@ async fn test_blobs_by_range() { .unwrap_or(0); } let mut actual_count = 0; + while let Some(next) = rig.network_rx.recv().await { + if let NetworkMessage::SendResponse { + peer_id: _, + response: Response::BlobsByRange(blob), + inbound_request_id: _, + } = next + { + if blob.is_some() { + actual_count += 1; + } else { + break; + } + } else { + panic!("unexpected message {:?}", next); + } + } + if test_spec::().fulu_fork_epoch.is_some() { + assert_eq!(0, actual_count, "Post-Fulu should return 0 blobs"); + } else { + assert_eq!(blob_count, actual_count); + } +} + +#[tokio::test] +async fn test_blobs_by_range_spans_fulu_fork() { + // Only test for Electra & Fulu fork transition + if test_spec::().electra_fork_epoch.is_none() { + return; + }; + let mut spec = test_spec::(); + spec.fulu_fork_epoch = Some(Epoch::new(1)); + spec.gloas_fork_epoch = Some(Epoch::new(2)); + + let mut rig = TestRig::new_parametric( + 64, + BeaconProcessorConfig::default(), + NodeCustodyType::Fullnode, + spec, + ) + .await; + + let start_slot = 16; + // This will span from epoch 0 (Electra) to epoch 1 (Fulu) + let slot_count = 32; + + rig.enqueue_blobs_by_range_request(start_slot, slot_count); + + let mut blob_count = 0; + for slot in start_slot..slot_count { + let root = rig + .chain + .block_root_at_slot(Slot::new(slot), WhenSlotSkipped::None) + .unwrap(); + blob_count += root + .map(|root| { + rig.chain + .get_blobs(&root) + .map(|list| list.len()) + .unwrap_or(0) + }) + .unwrap_or(0); + } + + let mut actual_count = 0; + while let Some(next) = rig.network_rx.recv().await { if 
let NetworkMessage::SendResponse { peer_id: _, @@ -1670,6 +1756,116 @@ async fn test_blobs_by_range() { assert_eq!(blob_count, actual_count); } +#[tokio::test] +async fn test_blobs_by_root() { + if test_spec::().deneb_fork_epoch.is_none() { + return; + }; + + let mut rig = TestRig::new(64).await; + + // Get the block root of a sample slot, e.g., slot 1 + let block_root = rig + .chain + .block_root_at_slot(Slot::new(1), WhenSlotSkipped::None) + .unwrap() + .unwrap(); + + let blobs = rig.chain.get_blobs(&block_root).unwrap(); + let blob_count = blobs.len(); + + let blob_ids: Vec = (0..blob_count) + .map(|index| BlobIdentifier { + block_root, + index: index as u64, + }) + .collect(); + + let blob_ids_list = RuntimeVariableList::new(blob_ids, blob_count).unwrap(); + + rig.enqueue_blobs_by_root_request(blob_ids_list); + + let mut blob_count = 0; + let root = rig + .chain + .block_root_at_slot(Slot::new(1), WhenSlotSkipped::None) + .unwrap(); + blob_count += root + .map(|root| { + rig.chain + .get_blobs(&root) + .map(|list| list.len()) + .unwrap_or(0) + }) + .unwrap_or(0); + + let mut actual_count = 0; + + while let Some(next) = rig.network_rx.recv().await { + if let NetworkMessage::SendResponse { + peer_id: _, + response: Response::BlobsByRoot(blob), + inbound_request_id: _, + } = next + { + if blob.is_some() { + actual_count += 1; + } else { + break; + } + } else { + panic!("unexpected message {:?}", next); + } + } + assert_eq!(blob_count, actual_count); +} + +#[tokio::test] +async fn test_blobs_by_root_post_fulu_should_return_empty() { + // Only test for Fulu fork + if test_spec::().fulu_fork_epoch.is_none() { + return; + }; + + let mut rig = TestRig::new(64).await; + + let block_root = rig + .chain + .block_root_at_slot(Slot::new(1), WhenSlotSkipped::None) + .unwrap() + .unwrap(); + + let blob_ids = vec![BlobIdentifier { + block_root, + index: 0, + }]; + + let blob_ids_list = RuntimeVariableList::new(blob_ids, 1).unwrap(); + + 
rig.enqueue_blobs_by_root_request(blob_ids_list); + + let mut actual_count = 0; + + while let Some(next) = rig.network_rx.recv().await { + if let NetworkMessage::SendResponse { + peer_id: _, + response: Response::BlobsByRoot(blob), + inbound_request_id: _, + } = next + { + if blob.is_some() { + actual_count += 1; + } else { + break; + } + } else { + panic!("unexpected message {:?}", next); + } + } + // Post-Fulu should return 0 blobs + assert_eq!(0, actual_count); +} + /// Ensure that data column processing that results in block import sends a sync notification #[tokio::test] async fn test_data_column_import_notifies_sync() { diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index d5a4e9b73a..6c0cbd7e55 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -9,24 +9,27 @@ //! sync as failed, log an error and attempt to retry once a new peer joins the node. use crate::network_beacon_processor::ChainSegmentProcessId; +use crate::sync::batch::{ + BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, +}; use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::manager::BatchProcessResult; use crate::sync::network_context::{ RangeRequestId, RpcRequestSendError, RpcResponseError, SyncNetworkContext, }; -use crate::sync::range_sync::{ - BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, -}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; +use std::collections::hash_map::DefaultHasher; use std::collections::{ HashSet, btree_map::{BTreeMap, Entry}, }; +use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; use 
std::sync::Arc; use tracing::{debug, error, info, warn}; use types::{ColumnIndex, Epoch, EthSpec}; @@ -49,21 +52,27 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 10; -/// Custom configuration for the batch object. -struct BackFillBatchConfig {} +type RpcBlocks = Vec>; -impl BatchConfig for BackFillBatchConfig { +type BackFillBatchInfo = BatchInfo, RpcBlocks>; + +type BackFillSyncBatches = BTreeMap>; + +/// Custom configuration for the batch object. +struct BackFillBatchConfig { + marker: PhantomData, +} + +impl BatchConfig for BackFillBatchConfig { fn max_batch_download_attempts() -> u8 { MAX_BATCH_DOWNLOAD_ATTEMPTS } fn max_batch_processing_attempts() -> u8 { MAX_BATCH_PROCESSING_ATTEMPTS } - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64 { - use std::collections::hash_map::DefaultHasher; - use std::hash::{Hash, Hasher}; + fn batch_attempt_hash(data: &D) -> u64 { let mut hasher = DefaultHasher::new(); - blocks.hash(&mut hasher); + data.hash(&mut hasher); hasher.finish() } } @@ -121,7 +130,7 @@ pub struct BackFillSync { last_batch_downloaded: bool, /// Sorted map of batches undergoing some kind of processing. - batches: BTreeMap>, + batches: BackFillSyncBatches, /// The current processing batch, if any. current_processing_batch: Option, @@ -210,7 +219,7 @@ impl BackFillSync { .network_globals .peers .read() - .synced_peers_for_epoch(self.to_be_downloaded, None) + .synced_peers_for_epoch(self.to_be_downloaded) .next() .is_some() // backfill can't progress if we do not have peers in the required subnets post peerdas. 
@@ -313,7 +322,6 @@ impl BackFillSync { CouplingError::DataColumnPeerFailure { error, faulty_peers, - action, exceeded_retries, } => { debug!(?batch_id, error, "Block components coupling error"); @@ -325,11 +333,8 @@ impl BackFillSync { failed_columns.insert(*column); failed_peers.insert(*peer); } - for peer in failed_peers.iter() { - network.report_peer(*peer, *action, "failed to return columns"); - } - // Only retry if peer failure **and** retries have been exceeded + // Only retry if peer failure **and** retries haven't been exceeded if !*exceeded_retries { return self.retry_partial_batch( network, @@ -353,7 +358,7 @@ impl BackFillSync { // reasons. Check that this block belongs to the expected peer // TODO(das): removed peer_id matching as the node may request a different peer for data // columns. - if !batch.is_expecting_block(&request_id) { + if !batch.is_expecting_request_id(&request_id) { return Ok(()); } debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); @@ -397,12 +402,13 @@ impl BackFillSync { // sending an error /timeout) if the peer is removed from the chain for other // reasons. 
Check that this block belongs to the expected peer, and that the // request_id matches - if !batch.is_expecting_block(&request_id) { + if !batch.is_expecting_request_id(&request_id) { return Ok(ProcessResult::Successful); } + let received = blocks.len(); match batch.download_completed(blocks, *peer_id) { - Ok(received) => { + Ok(_) => { let awaiting_batches = self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; debug!( @@ -888,7 +894,7 @@ impl BackFillSync { .network_globals .peers .read() - .synced_peers_for_epoch(batch_id, None) + .synced_peers_for_epoch(batch_id) .cloned() .collect::>(); @@ -899,6 +905,7 @@ impl BackFillSync { request, RangeRequestId::BackfillSync { batch_id }, &synced_peers, + &synced_peers, // All synced peers have imported up to the finalized slot so they must have their custody columns available &failed_peers, ) { Ok(request_id) => { @@ -964,7 +971,7 @@ impl BackFillSync { .network_globals() .peers .read() - .synced_peers_for_epoch(batch_id, None) + .synced_peers_for_epoch(batch_id) .cloned() .collect::>(); @@ -1053,7 +1060,7 @@ impl BackFillSync { // only request batches up to the buffer size limit // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync // if the current processing window is contained in a long range of skip slots. - let in_buffer = |batch: &BatchInfo| { + let in_buffer = |batch: &BackFillBatchInfo| { matches!( batch.state(), BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) 
diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/batch.rs similarity index 77% rename from beacon_node/network/src/sync/range_sync/batch.rs rename to beacon_node/network/src/sync/batch.rs index 31e6594139..ea0ef15f4b 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/batch.rs @@ -2,29 +2,28 @@ use beacon_chain::block_verification_types::RpcBlock; use derivative::Derivative; use lighthouse_network::PeerId; use lighthouse_network::rpc::methods::BlocksByRangeRequest; +use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; use lighthouse_network::service::api_types::Id; use std::collections::HashSet; -use std::fmt; -use std::hash::{Hash, Hasher}; +use std::hash::Hash; +use std::marker::PhantomData; use std::ops::Sub; -use std::time::{Duration, Instant}; +use std::time::Duration; +use std::time::Instant; use strum::Display; -use types::{Epoch, EthSpec, Slot}; +use types::Slot; +use types::{DataColumnSidecarList, Epoch, EthSpec}; -/// The number of times to retry a batch before it is considered failed. -const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; - -/// Invalid batches are attempted to be re-downloaded from other peers. If a batch cannot be processed -/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. -const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; +pub type BatchId = Epoch; /// Type of expected batch. -#[derive(Debug, Copy, Clone, Display)] +#[derive(Debug, Clone, Display)] #[strum(serialize_all = "snake_case")] pub enum ByRangeRequestType { BlocksAndColumns, BlocksAndBlobs, Blocks, + Columns(HashSet), } /// Allows customisation of the above constants used in other sync methods such as BackFillSync. @@ -60,28 +59,10 @@ pub trait BatchConfig { /// Note that simpler hashing functions considered in the past (hash of first block, hash of last /// block, number of received blocks) are not good enough to differentiate attempts. 
For this /// reason, we hash the complete set of blocks both in RangeSync and BackFillSync. - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64; + fn batch_attempt_hash(data: &D) -> u64; } #[derive(Debug)] -pub struct RangeSyncBatchConfig {} - -impl BatchConfig for RangeSyncBatchConfig { - fn max_batch_download_attempts() -> u8 { - MAX_BATCH_DOWNLOAD_ATTEMPTS - } - fn max_batch_processing_attempts() -> u8 { - MAX_BATCH_PROCESSING_ATTEMPTS - } - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64 { - let mut hasher = std::collections::hash_map::DefaultHasher::new(); - blocks.hash(&mut hasher); - hasher.finish() - } -} - -/// Error type of a batch in a wrong state. -// Such errors should never be encountered. pub struct WrongState(pub(crate) String); /// After batch operations, we use this to communicate whether a batch can continue or not @@ -100,28 +81,30 @@ pub enum BatchProcessingResult { #[derive(Derivative)] #[derivative(Debug)] /// A segment of a chain. -pub struct BatchInfo { +pub struct BatchInfo { /// Start slot of the batch. start_slot: Slot, /// End slot of the batch. end_slot: Slot, /// The `Attempts` that have been made and failed to send us this batch. - failed_processing_attempts: Vec, + failed_processing_attempts: Vec>, /// Number of processing attempts that have failed but we do not count. non_faulty_processing_attempts: u8, /// The number of download retries this batch has undergone due to a failed request. failed_download_attempts: Vec>, /// State of the batch. - state: BatchState, + state: BatchState, /// Whether this batch contains all blocks or all blocks and blobs. 
batch_type: ByRangeRequestType, /// Pin the generic #[derivative(Debug = "ignore")] - marker: std::marker::PhantomData, + marker: std::marker::PhantomData<(E, B)>, } -impl fmt::Display for BatchInfo { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Display + for BatchInfo +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "Start Slot: {}, End Slot: {}, State: {}", @@ -132,21 +115,21 @@ impl fmt::Display for BatchInfo { #[derive(Display)] /// Current state of a batch -pub enum BatchState { +pub enum BatchState { /// The batch has failed either downloading or processing, but can be requested again. AwaitingDownload, /// The batch is being downloaded. Downloading(Id), /// The batch has been completely downloaded and is ready for processing. - AwaitingProcessing(PeerId, Vec>, Instant), + AwaitingProcessing(PeerId, D, Instant), /// The batch is being processed. - Processing(Attempt), + Processing(Attempt), /// The batch was successfully processed and is waiting to be validated. /// /// It is not sufficient to process a batch successfully to consider it correct. This is /// because batches could be erroneously empty, or incomplete. Therefore, a batch is considered /// valid, only if the next sequential batch imports at least a block. - AwaitingValidation(Attempt), + AwaitingValidation(Attempt), /// Intermediate state for inner state handling. Poisoned, /// The batch has maxed out the allowed attempts for either downloading or processing. It @@ -154,14 +137,14 @@ pub enum BatchState { Failed, } -impl BatchState { +impl BatchState { /// Helper function for poisoning a state. - pub fn poison(&mut self) -> BatchState { + pub fn poison(&mut self) -> BatchState { std::mem::replace(self, BatchState::Poisoned) } } -impl BatchInfo { +impl BatchInfo { /// Batches are downloaded excluding the first block of the epoch assuming it has already been /// downloaded. 
/// @@ -178,13 +161,13 @@ impl BatchInfo { pub fn new(start_epoch: &Epoch, num_of_epochs: u64, batch_type: ByRangeRequestType) -> Self { let start_slot = start_epoch.start_slot(E::slots_per_epoch()); let end_slot = start_slot + num_of_epochs * E::slots_per_epoch(); - BatchInfo { + Self { start_slot, end_slot, failed_processing_attempts: Vec::new(), failed_download_attempts: Vec::new(), non_faulty_processing_attempts: 0, - state: BatchState::AwaitingDownload, + state: BatchState::::AwaitingDownload, batch_type, marker: std::marker::PhantomData, } } @@ -208,8 +191,8 @@ impl BatchInfo { peers } - /// Verifies if an incoming block belongs to this batch. - pub fn is_expecting_block(&self, request_id: &Id) -> bool { + /// Verifies if an incoming request id belongs to this batch. + pub fn is_expecting_request_id(&self, request_id: &Id) -> bool { if let BatchState::Downloading(expected_id) = &self.state { return expected_id == request_id; } @@ -227,30 +210,6 @@ impl BatchInfo { } } - /// Returns the count of stored pending blocks if in awaiting processing state - pub fn pending_blocks(&self) -> usize { - match &self.state { - BatchState::AwaitingProcessing(_, blocks, _) => blocks.len(), - BatchState::AwaitingDownload - | BatchState::Downloading { .. } - | BatchState::Processing { .. } - | BatchState::AwaitingValidation { .. } - | BatchState::Poisoned - | BatchState::Failed => 0, - } - } - - /// Returns a BlocksByRange request associated with the batch. - pub fn to_blocks_by_range_request(&self) -> (BlocksByRangeRequest, ByRangeRequestType) { - ( - BlocksByRangeRequest::new( - self.start_slot.into(), - self.end_slot.sub(self.start_slot).into(), - ), - self.batch_type, - ) - } - /// After different operations over a batch, this could be in a state that allows it to /// continue, or in failed state. When the batch has failed, we check if it did mainly due to /// processing failures. In this case the batch is considered failed and faulty. 
@@ -265,27 +224,22 @@ impl BatchInfo { } } - pub fn state(&self) -> &BatchState { + pub fn state(&self) -> &BatchState { &self.state } - pub fn attempts(&self) -> &[Attempt] { + pub fn attempts(&self) -> &[Attempt] { &self.failed_processing_attempts } - /// Marks the batch as ready to be processed if the blocks are in the range. The number of - /// received blocks is returned, or the wrong batch end on failure + /// Marks the batch as ready to be processed once its data has been downloaded. Returns an + /// error if the batch was not in the `Downloading` state. #[must_use = "Batch may have failed"] - pub fn download_completed( - &mut self, - blocks: Vec>, - peer: PeerId, - ) -> Result { + pub fn download_completed(&mut self, data_columns: D, peer: PeerId) -> Result<(), WrongState> { match self.state.poison() { BatchState::Downloading(_) => { - let received = blocks.len(); - self.state = BatchState::AwaitingProcessing(peer, blocks, Instant::now()); - Ok(received) + self.state = BatchState::AwaitingProcessing(peer, data_columns, Instant::now()); + Ok(()) } BatchState::Poisoned => unreachable!("Poisoned batch"), other => { @@ -334,6 +288,31 @@ impl BatchInfo { } } + /// Change the batch state from `Self::Downloading` to `Self::AwaitingDownload` without + /// registering a failed attempt. + /// + /// Note: must use this cautiously with some level of retry protection + /// as not registering a failed attempt could lead to requesting in a loop. 
+ #[must_use = "Batch may have failed"] + pub fn downloading_to_awaiting_download( + &mut self, + ) -> Result { + match self.state.poison() { + BatchState::Downloading(_) => { + self.state = BatchState::AwaitingDownload; + Ok(self.outcome()) + } + BatchState::Poisoned => unreachable!("Poisoned batch"), + other => { + self.state = other; + Err(WrongState(format!( + "Download failed for batch in wrong state {:?}", + self.state + ))) + } + } + } + pub fn start_downloading(&mut self, request_id: Id) -> Result<(), WrongState> { match self.state.poison() { BatchState::AwaitingDownload => { @@ -351,17 +330,17 @@ impl BatchInfo { } } - pub fn start_processing(&mut self) -> Result<(Vec>, Duration), WrongState> { + pub fn start_processing(&mut self) -> Result<(D, Duration), WrongState> { match self.state.poison() { - BatchState::AwaitingProcessing(peer, blocks, start_instant) => { - self.state = BatchState::Processing(Attempt::new::(peer, &blocks)); - Ok((blocks, start_instant.elapsed())) + BatchState::AwaitingProcessing(peer, data_columns, start_instant) => { + self.state = BatchState::Processing(Attempt::new::(peer, &data_columns)); + Ok((data_columns, start_instant.elapsed())) } BatchState::Poisoned => unreachable!("Poisoned batch"), other => { self.state = other; Err(WrongState(format!( - "Starting procesing batch in wrong state {:?}", + "Starting processing batch in wrong state {:?}", self.state ))) } @@ -441,37 +420,86 @@ impl BatchInfo { } } -/// Represents a peer's attempt and providing the result for this batch. -/// -/// Invalid attempts will downscore a peer. -#[derive(PartialEq, Debug)] -pub struct Attempt { +// BatchInfo implementations for RangeSync +impl BatchInfo>> { + /// Returns a BlocksByRange request associated with the batch. 
+ pub fn to_blocks_by_range_request(&self) -> (BlocksByRangeRequest, ByRangeRequestType) { + ( + BlocksByRangeRequest::new( + self.start_slot.into(), + self.end_slot.sub(self.start_slot).into(), + ), + self.batch_type.clone(), + ) + } + + /// Returns the count of stored pending blocks if in awaiting processing state + pub fn pending_blocks(&self) -> usize { + match &self.state { + BatchState::AwaitingProcessing(_, blocks, _) => blocks.len(), + BatchState::AwaitingDownload + | BatchState::Downloading { .. } + | BatchState::Processing { .. } + | BatchState::AwaitingValidation { .. } + | BatchState::Poisoned + | BatchState::Failed => 0, + } + } +} + +// BatchInfo implementation for CustodyBackFillSync +impl BatchInfo> { + /// Returns a DataColumnsByRange request associated with the batch. + pub fn to_data_columns_by_range_request( + &self, + ) -> Result { + match &self.batch_type { + ByRangeRequestType::Columns(columns) => Ok(DataColumnsByRangeRequest { + start_slot: self.start_slot.into(), + count: self.end_slot.sub(self.start_slot).into(), + columns: columns.clone().into_iter().collect(), + }), + _ => Err(WrongState( + "Custody backfill sync can only make data columns by range requests.".to_string(), + )), + } + } +} + +#[derive(Debug)] +pub struct Attempt { /// The peer that made the attempt. pub peer_id: PeerId, /// The hash of the blocks of the attempt. pub hash: u64, + /// Pin the generic. 
+ marker: PhantomData, } -impl Attempt { - fn new(peer_id: PeerId, blocks: &[RpcBlock]) -> Self { - let hash = B::batch_attempt_hash(blocks); - Attempt { peer_id, hash } +impl Attempt { + fn new(peer_id: PeerId, data: &D) -> Self { + let hash = B::batch_attempt_hash(data); + Attempt { + peer_id, + hash, + marker: PhantomData, + } } } -impl std::fmt::Debug for BatchState { +impl std::fmt::Debug for BatchState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - BatchState::Processing(Attempt { peer_id, hash: _ }) => { + BatchState::Processing(Attempt { peer_id, .. }) => { write!(f, "Processing({})", peer_id) } - BatchState::AwaitingValidation(Attempt { peer_id, hash: _ }) => { + BatchState::AwaitingValidation(Attempt { peer_id, .. }) => { write!(f, "AwaitingValidation({})", peer_id) } BatchState::AwaitingDownload => f.write_str("AwaitingDownload"), BatchState::Failed => f.write_str("Failed"), - BatchState::AwaitingProcessing(peer, blocks, _) => { - write!(f, "AwaitingProcessing({}, {} blocks)", peer, blocks.len()) + BatchState::AwaitingProcessing(peer, ..) 
=> { + write!(f, "AwaitingProcessing({})", peer) } BatchState::Downloading(request_id) => { write!(f, "Downloading({})", request_id) @@ -481,7 +509,7 @@ impl std::fmt::Debug for BatchState { } } -impl BatchState { +impl BatchState { /// Creates a character representation/visualization for the batch state to display in logs for quicker and /// easier recognition fn visualize(&self) -> char { diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index ffc79c1550..cd9276f7e3 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -2,7 +2,7 @@ use beacon_chain::{ block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root, }; use lighthouse_network::{ - PeerAction, PeerId, + PeerId, service::api_types::{ BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, }, @@ -36,7 +36,7 @@ pub struct RangeBlockComponentsRequest { pub(crate) request_span: Span, } -enum ByRangeRequest { +pub enum ByRangeRequest { Active(I), Complete(T), } @@ -63,7 +63,6 @@ pub(crate) enum CouplingError { DataColumnPeerFailure { error: String, faulty_peers: Vec<(ColumnIndex, PeerId)>, - action: PeerAction, exceeded_retries: bool, }, BlobPeerFailure(String), @@ -253,7 +252,6 @@ impl RangeBlockComponentsRequest { if let Err(CouplingError::DataColumnPeerFailure { error: _, faulty_peers, - action: _, exceeded_retries: _, }) = &resp { @@ -377,7 +375,6 @@ impl RangeBlockComponentsRequest { return Err(CouplingError::DataColumnPeerFailure { error: format!("No columns for block {block_root:?} with data"), faulty_peers: responsible_peers, - action: PeerAction::LowToleranceError, exceeded_retries, }); @@ -402,7 +399,6 @@ impl RangeBlockComponentsRequest { return Err(CouplingError::DataColumnPeerFailure { error: format!("Peers did not return column for block_root {block_root:?} {naughty_peers:?}"), 
faulty_peers: naughty_peers, - action: PeerAction::LowToleranceError, exceeded_retries }); } @@ -439,7 +435,7 @@ impl RangeBlockComponentsRequest { } impl ByRangeRequest { - fn finish(&mut self, id: I, data: T) -> Result<(), String> { + pub fn finish(&mut self, id: I, data: T) -> Result<(), String> { match self { Self::Active(expected_id) => { if expected_id != &id { @@ -452,7 +448,7 @@ impl ByRangeRequest { } } - fn to_finished(&self) -> Option<&T> { + pub fn to_finished(&self) -> Option<&T> { match self { Self::Active(_) => None, Self::Complete(data) => Some(data), @@ -468,10 +464,10 @@ mod tests { NumBlobs, generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec, }; use lighthouse_network::{ - PeerAction, PeerId, + PeerId, service::api_types::{ BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - DataColumnsByRangeRequestId, Id, RangeRequestId, + DataColumnsByRangeRequestId, DataColumnsByRangeRequester, Id, RangeRequestId, }, }; use rand::SeedableRng; @@ -505,7 +501,7 @@ mod tests { fn columns_id( id: Id, - parent_request_id: ComponentsByRangeRequestId, + parent_request_id: DataColumnsByRangeRequester, ) -> DataColumnsByRangeRequestId { DataColumnsByRangeRequestId { id, @@ -602,7 +598,15 @@ mod tests { let columns_req_id = expects_custody_columns .iter() .enumerate() - .map(|(i, column)| (columns_id(i as Id, components_id), vec![*column])) + .map(|(i, column)| { + ( + columns_id( + i as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ), + vec![*column], + ) + }) .collect::>(); let mut info = RangeBlockComponentsRequest::::new( blocks_req_id, @@ -661,7 +665,15 @@ mod tests { let columns_req_id = batched_column_requests .iter() .enumerate() - .map(|(i, columns)| (columns_id(i as Id, components_id), columns.clone())) + .map(|(i, columns)| { + ( + columns_id( + i as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ), + columns.clone(), + ) + }) .collect::>(); let mut info = 
RangeBlockComponentsRequest::::new( @@ -742,7 +754,15 @@ mod tests { let columns_req_id = expected_custody_columns .iter() .enumerate() - .map(|(i, column)| (columns_id(i as Id, components_id), vec![*column])) + .map(|(i, column)| { + ( + columns_id( + i as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ), + vec![*column], + ) + }) .collect::>(); let mut info = RangeBlockComponentsRequest::::new( blocks_req_id, @@ -785,7 +805,6 @@ mod tests { if let Err(super::CouplingError::DataColumnPeerFailure { error, faulty_peers, - action, exceeded_retries, }) = result { @@ -793,7 +812,6 @@ mod tests { assert_eq!(faulty_peers.len(), 2); // columns 3 and 4 missing assert_eq!(faulty_peers[0].0, 3); // column index 3 assert_eq!(faulty_peers[1].0, 4); // column index 4 - assert!(matches!(action, PeerAction::LowToleranceError)); assert!(!exceeded_retries); // First attempt, should be false } else { panic!("Expected PeerFailure error"); @@ -822,7 +840,15 @@ mod tests { let columns_req_id = expected_custody_columns .iter() .enumerate() - .map(|(i, column)| (columns_id(i as Id, components_id), vec![*column])) + .map(|(i, column)| { + ( + columns_id( + i as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ), + vec![*column], + ) + }) .collect::>(); let mut info = RangeBlockComponentsRequest::::new( blocks_req_id, @@ -858,7 +884,10 @@ mod tests { assert!(result.is_err()); // AND: We retry with a new peer for the failed column - let new_columns_req_id = columns_id(10 as Id, components_id); + let new_columns_req_id = columns_id( + 10 as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ); let failed_column_requests = vec![(new_columns_req_id, vec![2])]; info.reinsert_failed_column_requests(failed_column_requests) .unwrap(); @@ -904,7 +933,15 @@ mod tests { let columns_req_id = expected_custody_columns .iter() .enumerate() - .map(|(i, column)| (columns_id(i as Id, components_id), vec![*column])) + .map(|(i, column)| { + ( 
+ columns_id( + i as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ), + vec![*column], + ) + }) .collect::>(); let mut info = RangeBlockComponentsRequest::::new( blocks_req_id, @@ -957,13 +994,11 @@ mod tests { if let Err(super::CouplingError::DataColumnPeerFailure { error: _, faulty_peers, - action, exceeded_retries, }) = result { assert_eq!(faulty_peers.len(), 1); // column 2 missing assert_eq!(faulty_peers[0].0, 2); // column index 2 - assert!(matches!(action, PeerAction::LowToleranceError)); assert!(exceeded_retries); // Should be true after max retries } else { panic!("Expected PeerFailure error with exceeded_retries=true"); diff --git a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs new file mode 100644 index 0000000000..69df3422e6 --- /dev/null +++ b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs @@ -0,0 +1,1126 @@ +use std::{ + collections::{BTreeMap, HashSet, btree_map::Entry}, + marker::PhantomData, + sync::Arc, +}; + +use beacon_chain::{BeaconChain, BeaconChainTypes}; +use lighthouse_network::{ + NetworkGlobals, PeerAction, PeerId, + service::api_types::{CustodyBackFillBatchRequestId, CustodyBackfillBatchId}, + types::CustodyBackFillState, +}; +use lighthouse_tracing::SPAN_CUSTODY_BACKFILL_SYNC_BATCH_REQUEST; +use logging::crit; +use std::hash::{DefaultHasher, Hash, Hasher}; +use tracing::{debug, error, info, info_span, warn}; +use types::{DataColumnSidecarList, Epoch, EthSpec}; + +use crate::sync::{ + backfill_sync::{BACKFILL_EPOCHS_PER_BATCH, ProcessResult, SyncStart}, + batch::{ + BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, + ByRangeRequestType, + }, + block_sidecar_coupling::CouplingError, + manager::CustodyBatchProcessResult, + network_context::{RpcResponseError, SyncNetworkContext}, +}; + +/// The maximum number of batches to queue before requesting more. 
+const BACKFILL_BATCH_BUFFER_SIZE: u8 = 5; + +/// Columns are downloaded in batches from peers. This constant specifies how many epochs worth of +/// columns per batch are requested _at most_. A batch may request less columns to account for +/// already requested columns. There is a timeout for each batch request. If this value is too high, +/// we will negatively report peers with poor bandwidth. This can be set arbitrarily high, in which +/// case the responder will fill the response up to the max request size, assuming they have the +/// bandwidth to do so. +pub const CUSTODY_BACKFILL_EPOCHS_PER_BATCH: u64 = 1; + +type CustodyBackFillBatchInfo = + BatchInfo, DataColumnSidecarList>; +type CustodyBackFillBatches = BTreeMap>; + +#[derive(Debug)] +pub struct CustodyBackFillBatchConfig { + marker: PhantomData, +} + +impl BatchConfig for CustodyBackFillBatchConfig { + fn max_batch_download_attempts() -> u8 { + 5 + } + fn max_batch_processing_attempts() -> u8 { + 5 + } + fn batch_attempt_hash(data: &D) -> u64 { + let mut hasher = DefaultHasher::new(); + data.hash(&mut hasher); + hasher.finish() + } +} + +/// The ways a custody backfill sync can fail. +// The info in the enum variants is displayed in logging, clippy thinks it's dead code. +#[derive(Debug)] +pub enum CustodyBackfillError { + /// A batch failed to be downloaded. + BatchDownloadFailed(#[allow(dead_code)] BatchId), + /// A batch could not be processed. + BatchProcessingFailed(#[allow(dead_code)] BatchId), + /// A batch entered an invalid state. + BatchInvalidState(#[allow(dead_code)] BatchId, #[allow(dead_code)] String), + /// The sync algorithm entered an invalid state. + InvalidSyncState(#[allow(dead_code)] String), + /// The chain became paused. + Paused, +} + +pub struct CustodyBackFillSync { + /// Keeps track of the current progress of the custody backfill. + /// This only gets refreshed from the beacon chain if we enter a failed state. 
+ current_start: BatchId, + + /// Starting epoch of the batch that needs to be processed next. + /// This is incremented as the chain advances. + processing_target: BatchId, + + /// The custody group count we are trying to fulfill up to the DA window. + /// This is used as an indicator to restart custody backfill sync if the cgc + /// was changed in the middle of a currently active sync. + cgc: u64, + + /// Run ID of this backfill process. Increments if sync restarts. Used to differentiate batch + /// results from different runs. + run_id: u64, + + /// Starting epoch of the next batch that needs to be downloaded. + to_be_downloaded: BatchId, + + /// Keeps track if we have requested the final batch. + last_batch_downloaded: bool, + + /// Sorted map of batches undergoing some kind of processing. + batches: CustodyBackFillBatches, + + /// The current processing batch, if any. + current_processing_batch: Option, + + /// Batches validated. + validated_batches: u64, + + /// These are batches that we've skipped because we have no columns to fetch for the epoch. + skipped_batches: HashSet, + + /// When a custody backfill sync fails, we keep track of whether a new fully synced peer has joined. + /// This signifies that we are able to attempt to restart a failed chain. + restart_failed_sync: bool, + + /// Reference to the beacon chain to obtain initial starting points for custody backfill sync. + beacon_chain: Arc>, + + /// Reference to the network globals in order to obtain valid peers to backfill columns from + /// (i.e synced peers). 
+ network_globals: Arc>, +} + +impl CustodyBackFillSync { + pub fn new( + beacon_chain: Arc>, + network_globals: Arc>, + ) -> Self { + Self { + current_start: Epoch::new(0), + processing_target: Epoch::new(0), + cgc: 0, + run_id: 0, + to_be_downloaded: Epoch::new(0), + last_batch_downloaded: false, + batches: BTreeMap::new(), + skipped_batches: HashSet::new(), + current_processing_batch: None, + validated_batches: 0, + restart_failed_sync: false, + beacon_chain, + network_globals, + } + } + + /// Pauses the custody sync if it's currently syncing. + pub fn pause(&mut self, reason: String) { + if let CustodyBackFillState::Syncing = self.state() { + debug!(processed_epochs = %self.validated_batches, to_be_processed = %self.current_start,"Custody backfill sync paused"); + self.set_state(CustodyBackFillState::Pending(reason)); + } + } + + /// Checks if custody backfill sync should start and sets the missing columns + /// custody backfill sync will attempt to fetch. + /// The criteria to start custody sync is: + /// - The earliest data column epoch's custodied columns != previous epoch's custodied columns + /// - The earliest data column epoch is a finalied epoch + pub fn should_start_custody_backfill_sync(&mut self) -> bool { + let Some(da_boundary_epoch) = self.beacon_chain.get_column_da_boundary() else { + return false; + }; + + // This is the epoch in which we have met our current custody requirements + let Some(earliest_data_column_epoch) = + self.beacon_chain.earliest_custodied_data_column_epoch() + else { + return false; + }; + + // Check if we have missing columns between the da boundary and `earliest_data_column_epoch` + let missing_columns = self + .beacon_chain + .get_missing_columns_for_epoch(da_boundary_epoch); + + if !missing_columns.is_empty() { + let latest_finalized_epoch = self + .beacon_chain + .canonical_head + .cached_head() + .finalized_checkpoint() + .epoch; + + // Check that the earliest data column epoch is a finalized epoch. 
+ return earliest_data_column_epoch <= latest_finalized_epoch; + } + + false + } + + fn restart_sync(&mut self) { + // Set state to paused + self.set_state(CustodyBackFillState::Pending( + "CGC count has changed and custody backfill sync needs to restart".to_string(), + )); + + // Remove all batches and active requests. + self.batches.clear(); + self.skipped_batches.clear(); + self.restart_failed_sync = false; + + // Reset all downloading and processing targets + // NOTE: Lets keep validated_batches for posterity + self.processing_target = Epoch::new(0); + self.to_be_downloaded = Epoch::new(0); + self.last_batch_downloaded = false; + self.current_processing_batch = None; + self.validated_batches = 0; + self.run_id += 1; + + self.set_start_epoch(); + self.set_cgc(); + } + + fn restart_if_required(&mut self) -> bool { + let cgc_at_head = self + .beacon_chain + .data_availability_checker + .custody_context() + .custody_group_count_at_head(&self.beacon_chain.spec); + + if cgc_at_head != self.cgc { + self.restart_sync(); + return true; + } + + false + } + + /// Starts syncing. 
+ #[must_use = "A failure here indicates custody backfill sync has failed and the global sync state should be updated"] + pub fn start( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result { + match self.state() { + CustodyBackFillState::Syncing => { + if self.restart_if_required() { + return Ok(SyncStart::NotSyncing); + } + + if self.check_completed() { + self.set_state(CustodyBackFillState::Completed); + return Ok(SyncStart::NotSyncing); + } + } + CustodyBackFillState::Pending(_) | CustodyBackFillState::Completed => { + if self.check_completed() { + self.set_state(CustodyBackFillState::Completed); + return Ok(SyncStart::NotSyncing); + } + self.set_cgc(); + + if !self.should_start_custody_backfill_sync() { + return Ok(SyncStart::NotSyncing); + } + self.set_start_epoch(); + if self + .network_globals + .peers + .read() + .synced_peers() + .next() + .is_some() + { + debug!( + run_id = self.run_id, + current_start = %self.current_start, + processing_target = %self.processing_target, + to_be_downloaded = %self.to_be_downloaded, + "Starting custody backfill sync" + ); + // If there are peers to resume with, begin the resume. + self.set_state(CustodyBackFillState::Syncing); + // Resume any previously failed batches. + self.resume_batches(network)?; + // begin requesting blocks from the peer pool, until all peers are exhausted. 
+ self.request_batches(network)?; + + // start processing batches if needed + self.process_completed_batches(network)?; + } else { + return Ok(SyncStart::NotSyncing); + } + } + } + + let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else { + return Ok(SyncStart::NotSyncing); + }; + + Ok(SyncStart::Syncing { + completed: (self.validated_batches + * CUSTODY_BACKFILL_EPOCHS_PER_BATCH + * T::EthSpec::slots_per_epoch()) as usize, + remaining: self + .current_start + .end_slot(T::EthSpec::slots_per_epoch()) + .saturating_sub(column_da_boundary.start_slot(T::EthSpec::slots_per_epoch())) + .as_usize(), + }) + } + + fn set_cgc(&mut self) { + self.cgc = self + .beacon_chain + .data_availability_checker + .custody_context() + .custody_group_count_at_head(&self.beacon_chain.spec); + } + + fn set_start_epoch(&mut self) { + let earliest_data_column_epoch = self + .beacon_chain + .earliest_custodied_data_column_epoch() + .unwrap_or(Epoch::new(0)); + + self.current_start = earliest_data_column_epoch + 1; + self.processing_target = self.current_start; + self.to_be_downloaded = self.current_start; + } + + /// Attempts to request the next required batches from the peer pool. It will exhaust the peer + /// pool and left over batches until the batch buffer is reached or all peers are exhausted. + fn request_batches( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result<(), CustodyBackfillError> { + if !matches!(self.state(), CustodyBackFillState::Syncing) { + return Ok(()); + } + + // find the next pending batch and request it from the peer + // Note: for this function to not infinite loop we must: + // - If `include_next_batch` returns Some we MUST increase the count of batches that are + // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of + // that function. 
+ while let Some(batch_id) = self.include_next_batch() { + // send the batch + self.send_batch(network, batch_id)?; + } + + // No more batches, simply stop + Ok(()) + } + + /// When resuming a chain, this function searches for batches that need to be re-downloaded and + /// transitions their state to redownload the batch. + fn resume_batches( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result<(), CustodyBackfillError> { + let batch_ids_to_retry = self + .batches + .iter() + .filter_map(|(batch_id, batch)| { + // In principle there should only ever be on of these, and we could terminate the + // loop early, however the processing is negligible and we continue the search + // for robustness to handle potential future modification + if matches!(batch.state(), BatchState::AwaitingDownload) { + Some(*batch_id) + } else { + None + } + }) + .collect::>(); + + for batch_id in batch_ids_to_retry { + self.send_batch(network, batch_id)?; + } + Ok(()) + } + + /// Creates the next required batch from the chain. If there are no more batches required, + /// `None` is returned. + fn include_next_batch(&mut self) -> Option { + let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else { + return None; + }; + + let mut missing_columns = HashSet::new(); + + // Skip all batches (Epochs) that don't have missing columns. + for epoch in Epoch::range_inclusive_rev(self.to_be_downloaded, column_da_boundary) { + missing_columns = self.beacon_chain.get_missing_columns_for_epoch(epoch); + + if !missing_columns.is_empty() { + self.to_be_downloaded = epoch; + break; + } + + // This batch is being skipped, insert it into the skipped batches mapping. 
+ self.skipped_batches.insert(epoch); + + if epoch == column_da_boundary { + return None; + } + } + + // Don't request batches before the column da boundary + if self.to_be_downloaded < column_da_boundary { + return None; + } + + // Don't request batches beyond the DA window + if self.last_batch_downloaded { + return None; + } + + // Only request batches up to the buffer size limit + // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync + // if the current processing window is contained in a long range of skip slots. + let in_buffer = |batch: &CustodyBackFillBatchInfo| { + matches!( + batch.state(), + BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) + ) + }; + if self + .batches + .iter() + .filter(|&(_epoch, batch)| in_buffer(batch)) + .count() + > BACKFILL_BATCH_BUFFER_SIZE as usize + { + return None; + } + + let batch_id = self.to_be_downloaded; + + match self.batches.entry(batch_id) { + Entry::Occupied(_) => { + // this batch doesn't need downloading, let this same function decide the next batch + if self.would_complete(batch_id) { + self.last_batch_downloaded = true; + } + + self.to_be_downloaded = self + .to_be_downloaded + .saturating_sub(CUSTODY_BACKFILL_EPOCHS_PER_BATCH); + self.include_next_batch() + } + Entry::Vacant(entry) => { + entry.insert(BatchInfo::new( + &batch_id, + CUSTODY_BACKFILL_EPOCHS_PER_BATCH, + ByRangeRequestType::Columns(missing_columns), + )); + if self.would_complete(batch_id) { + self.last_batch_downloaded = true; + } + self.to_be_downloaded = self + .to_be_downloaded + .saturating_sub(CUSTODY_BACKFILL_EPOCHS_PER_BATCH); + Some(batch_id) + } + } + } + + /// Processes the batch with the given id. 
+ /// The batch must exist and be ready for processing + fn process_batch( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + ) -> Result { + // Check if we need to restart custody backfill sync due to a recent cgc change + if self.restart_if_required() { + return Ok(ProcessResult::Successful); + } + + if self.state() != CustodyBackFillState::Syncing || self.current_processing_batch.is_some() + { + return Ok(ProcessResult::Successful); + } + + let Some(batch) = self.batches.get_mut(&batch_id) else { + return self + .fail_sync(CustodyBackfillError::InvalidSyncState(format!( + "Trying to process a batch that does not exist: {}", + batch_id + ))) + .map(|_| ProcessResult::Successful); + }; + + let (data_columns, _) = match batch.start_processing() { + Err(e) => { + return self + .fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0)) + .map(|_| ProcessResult::Successful); + } + Ok(v) => v, + }; + + self.current_processing_batch = Some(batch_id); + + if let Err(e) = network.beacon_processor().send_historic_data_columns( + CustodyBackfillBatchId { + epoch: batch_id, + run_id: self.run_id, + }, + data_columns, + ) { + crit!( + msg = "process_batch", + error = %e, + batch = ?self.processing_target, + "Failed to send data columns to processor." + ); + // This is unlikely to happen but it would stall syncing since the batch now has no + // data columns to continue, and the chain is expecting a processing result that won't + // arrive. To mitigate this, (fake) fail this processing so that the batch is + // re-downloaded. + self.on_batch_process_result( + network, + CustodyBackfillBatchId { + epoch: batch_id, + run_id: self.run_id, + }, + &CustodyBatchProcessResult::Error { peer_action: None }, + ) + } else { + Ok(ProcessResult::Successful) + } + } + + /// A data column has been received for a batch. + /// If the column correctly completes the batch it will be processed if possible. 
+ /// If this returns an error, custody sync has failed and will be restarted once new peers + /// join the system. + /// The sync manager should update the global sync state on failure. + #[must_use = "A failure here indicates custody backfill sync has failed and the global sync state should be updated"] + pub fn on_data_column_response( + &mut self, + network: &mut SyncNetworkContext, + req_id: CustodyBackFillBatchRequestId, + peer_id: &PeerId, + resp: Result, RpcResponseError>, + ) -> Result { + if req_id.batch_id.run_id != self.run_id { + debug!(%req_id, "Ignoring custody backfill download response from different run_id"); + return Ok(ProcessResult::Successful); + } + + let batch_id = req_id.batch_id.epoch; + // check if we have this batch + let Some(batch) = self.batches.get_mut(&batch_id) else { + if !matches!(self.state(), CustodyBackFillState::Pending(_)) { + // A batch might get removed when custody sync advances, so this is non fatal. + debug!(epoch = %batch_id, "Received a column for unknown batch"); + } + return Ok(ProcessResult::Successful); + }; + + // A batch could be retried without the peer failing the request (disconnecting/ + // sending an error /timeout) if the peer is removed for other + // reasons. Check that this column belongs to the expected peer, and that the + // request_id matches + if !batch.is_expecting_request_id(&req_id.id) { + return Ok(ProcessResult::Successful); + } + + match resp { + Ok(data_columns) => { + let received = data_columns.len(); + + match batch.download_completed(data_columns, *peer_id) { + Ok(_) => { + let awaiting_batches = self.processing_target.saturating_sub(batch_id) + / CUSTODY_BACKFILL_EPOCHS_PER_BATCH; + debug!( + %req_id, + blocks = received, + %awaiting_batches, + "Completed batch received" + ); + + // pre-emptively request more columns from peers whilst we process current columns. 
+ self.request_batches(network)?; + self.process_completed_batches(network) + } + Err(e) => { + self.fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0))?; + Ok(ProcessResult::Successful) + } + } + } + Err(err) => { + debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); + + // If there are any coupling errors, penalize the appropriate peers + if let RpcResponseError::BlockComponentCouplingError(coupling_error) = err + && let CouplingError::DataColumnPeerFailure { + error, + faulty_peers, + exceeded_retries: _, + } = coupling_error + { + for (column_index, faulty_peer) in faulty_peers { + debug!( + ?error, + ?column_index, + ?faulty_peer, + "Custody backfill sync penalizing peer" + ); + network.report_peer( + faulty_peer, + PeerAction::LowToleranceError, + "Peer failed to serve column", + ); + } + } + + match batch.download_failed(Some(*peer_id)) { + Err(e) => { + self.fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0))?; + } + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { + self.fail_sync(CustodyBackfillError::BatchDownloadFailed(batch_id))?; + } + Ok(BatchOperationOutcome::Continue) => { + self.send_batch(network, batch_id)?; + } + } + Ok(ProcessResult::Successful) + } + } + } + + /// The beacon processor has completed processing a batch. This function handles the result + /// of the batch processor. + /// If an error is returned custody backfill sync has failed. 
+ #[must_use = "A failure here indicates custody backfill sync has failed and the global sync state should be updated"] + pub fn on_batch_process_result( + &mut self, + network: &mut SyncNetworkContext, + custody_batch_id: CustodyBackfillBatchId, + result: &CustodyBatchProcessResult, + ) -> Result { + let batch_id = custody_batch_id.epoch; + if custody_batch_id.run_id != self.run_id { + debug!(batch = %custody_batch_id, "Ignoring custody backfill error from different run_id"); + return Ok(ProcessResult::Successful); + } + + // The first two cases are possible in regular sync, should not occur in custody backfill, but we + // keep this logic for handling potential processing race conditions. + // result + let batch = match &self.current_processing_batch { + Some(processing_id) if *processing_id != batch_id => { + debug!( + batch_epoch = %batch_id, + expected_batch_epoch = processing_id.as_u64(), + "Unexpected batch result" + ); + return Ok(ProcessResult::Successful); + } + None => { + debug!(%batch_id, "Chain was not expecting a batch result"); + return Ok(ProcessResult::Successful); + } + _ => { + // batch_id matches, continue + self.current_processing_batch = None; + + match self.batches.get_mut(&batch_id) { + Some(batch) => batch, + None => { + // This is an error. Fail the sync algorithm. + return self + .fail_sync(CustodyBackfillError::InvalidSyncState(format!( + "Current processing batch not found: {}", + batch_id + ))) + .map(|_| ProcessResult::Successful); + } + } + } + }; + + let Some(peer) = batch.processing_peer() else { + self.fail_sync(CustodyBackfillError::BatchInvalidState( + batch_id, + String::from("Peer does not exist"), + ))?; + return Ok(ProcessResult::Successful); + }; + + debug!( + ?result, + batch_id = %custody_batch_id, + %peer, + client = %network.client_type(peer), + "Custody backfill batch processed" + ); + + match result { + CustodyBatchProcessResult::Success { + imported_columns, .. 
+ } => { + if let Err(e) = batch.processing_completed(BatchProcessingResult::Success) { + self.fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0))?; + } + + debug!(imported_count=?imported_columns, "Succesfully imported historical data columns"); + + self.advance_custody_backfill_sync(batch_id); + + let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else { + return Err(CustodyBackfillError::InvalidSyncState( + "Can't calculate column data availability boundary".to_string(), + )); + }; + + if batch_id == self.processing_target { + // Advance processing target to the previous epoch + // If the current processing target is above the column DA boundary + if self.processing_target > column_da_boundary { + self.processing_target = self + .processing_target + .saturating_sub(CUSTODY_BACKFILL_EPOCHS_PER_BATCH); + } + } + + // check if custody sync has completed syncing up to the DA window + if self.check_completed() { + info!( + validated_epochs = ?self.validated_batches, + run_id = self.run_id, + "Custody backfill sync completed" + ); + self.batches.clear(); + self.restart_failed_sync = false; + self.processing_target = self.current_start; + self.to_be_downloaded = self.current_start; + self.last_batch_downloaded = false; + self.current_processing_batch = None; + self.validated_batches = 0; + self.skipped_batches.clear(); + self.set_state(CustodyBackFillState::Completed); + self.beacon_chain.update_data_column_custody_info(None); + Ok(ProcessResult::SyncCompleted) + } else { + // custody sync is not completed + // attempt to request more batches + self.request_batches(network)?; + // attempt to process more batches + self.process_completed_batches(network) + } + } + CustodyBatchProcessResult::Error { peer_action } => { + match peer_action { + // Faulty failure + Some(peer_action) => { + match batch.processing_completed(BatchProcessingResult::FaultyFailure) { + Err(e) => { + // Batch was in the wrong state + 
self.fail_sync(CustodyBackfillError::BatchInvalidState( + batch_id, e.0, + )) + .map(|_| ProcessResult::Successful) + } + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { + warn!( + score_adjustment = ?peer_action, + batch_epoch = %batch_id, + "Custody backfill batch failed to download. Penalizing peers" + ); + self.fail_sync(CustodyBackfillError::BatchProcessingFailed( + batch_id, + )) + .map(|_| ProcessResult::Successful) + } + + Ok(BatchOperationOutcome::Continue) => { + self.advance_custody_backfill_sync(batch_id); + // Handle this invalid batch, that is within the re-process retries limit. + self.handle_invalid_batch(network, batch_id) + .map(|_| ProcessResult::Successful) + } + } + } + // Non faulty failure + None => { + if let Err(e) = + batch.processing_completed(BatchProcessingResult::NonFaultyFailure) + { + self.fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0))?; + } + self.send_batch(network, batch_id)?; + Ok(ProcessResult::Successful) + } + } + } + } + } + + /// Processes the next ready batch. + fn process_completed_batches( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result { + // Only process batches if custody backfill is syncing and only process one batch at a time + if self.state() != CustodyBackFillState::Syncing || self.current_processing_batch.is_some() + { + return Ok(ProcessResult::Successful); + } + + // Don't try to process batches before the Fulu fork epoch since data columns don't exist + if let Some(fulu_fork_epoch) = self.beacon_chain.spec.fulu_fork_epoch + && self.processing_target < fulu_fork_epoch + { + return Ok(ProcessResult::Successful); + } + + // Check if we need to restart custody backfill sync due to a cgc change. 
+ if self.restart_if_required() { + return Ok(ProcessResult::Successful); + } + + while self.skipped_batches.contains(&self.processing_target) { + self.skipped_batches.remove(&self.processing_target); + // Update data column custody info with the skipped batch + if let Err(e) = self + .beacon_chain + .safely_backfill_data_column_custody_info(self.processing_target) + { + // I can't see a scenario where this could happen, but if we don't + // handle this edge case custody backfill sync could be stuck indefinitely. + error!( + error=?e, + "Unable to update data column custody info, restarting sync" + ); + self.restart_sync(); + }; + self.processing_target -= BACKFILL_EPOCHS_PER_BATCH; + } + + // Find the id of the batch we are going to process. + if let Some(batch) = self.batches.get(&self.processing_target) { + let state = batch.state(); + match state { + BatchState::AwaitingProcessing(..) => { + return self.process_batch(network, self.processing_target); + } + BatchState::Downloading(..) => { + // Batch is not ready, nothing to process + } + // Batches can be in `AwaitingDownload` state if there weren't good data column subnet + // peers to send the request to. + BatchState::AwaitingDownload => return Ok(ProcessResult::Successful), + BatchState::AwaitingValidation(..) => { + // The batch is validated + } + BatchState::Poisoned => unreachable!("Poisoned batch"), + BatchState::Failed | BatchState::Processing(_) => { + // these are all inconsistent states: + // - Failed -> non recoverable batch. Columns should have been removed + // - AwaitingDownload -> A recoverable failed batch should have been + // re-requested. 
+ // - Processing -> `self.current_processing_batch` is None + self.fail_sync(CustodyBackfillError::InvalidSyncState(String::from( + "Invalid expected batch state", + )))?; + return Ok(ProcessResult::Successful); + } + } + } else { + self.fail_sync(CustodyBackfillError::InvalidSyncState(format!( + "Batch not found for current processing target {}", + self.processing_target + )))?; + return Ok(ProcessResult::Successful); + } + Ok(ProcessResult::Successful) + } + + /// Removes any batches previous to the given `validating_epoch` and advance custody backfill sync + /// to `validating_epoch`. + /// + /// The `validating_epoch` must align with batch boundaries. + fn advance_custody_backfill_sync(&mut self, validating_epoch: Epoch) { + let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else { + return; + }; + // make sure this epoch produces an advancement, unless its at the column DA boundary + if validating_epoch >= self.current_start && validating_epoch > column_da_boundary { + return; + } + + // We can now validate higher batches than the current batch. Here we remove all + // batches that are higher than the current batch. We add on an extra + // `BACKFILL_EPOCHS_PER_BATCH` as `split_off` is inclusive. + let removed_batches = self + .batches + .split_off(&(validating_epoch + CUSTODY_BACKFILL_EPOCHS_PER_BATCH)); + + for (id, batch) in removed_batches.into_iter() { + self.validated_batches = self.validated_batches.saturating_add(1); + match batch.state() { + BatchState::Downloading(..) | BatchState::AwaitingValidation(..) => {} + BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { + crit!("Batch indicates inconsistent data columns while advancing custody sync") + } + BatchState::AwaitingProcessing(..) 
=> {} + BatchState::Processing(_) => { + debug!(batch = %id, %batch, "Advancing custody sync while processing a batch"); + if let Some(processing_id) = self.current_processing_batch + && id >= processing_id + { + self.current_processing_batch = None; + } + } + } + } + + self.processing_target = self.processing_target.min(validating_epoch); + self.current_start = self.current_start.min(validating_epoch); + self.to_be_downloaded = self.to_be_downloaded.min(validating_epoch); + + if self.batches.contains_key(&self.to_be_downloaded) { + // if custody backfill sync is advanced by Range beyond the previous `self.to_be_downloaded`, we + // won't have this batch, so we need to request it. + self.to_be_downloaded -= CUSTODY_BACKFILL_EPOCHS_PER_BATCH; + } + debug!(?validating_epoch, processing_target = ?self.processing_target, "Custody backfill advanced"); + } + + /// An invalid batch has been received that could not be processed, but that can be retried. + /// + /// These events occur when a peer has successfully responded with columns, but the columns + /// received are incorrect or invalid. This indicates the peer has not performed as + /// intended and can result in down voting a peer. + fn handle_invalid_batch( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + ) -> Result<(), CustodyBackfillError> { + // The current batch could not be processed, indicating either the current or previous + // batches are invalid. + + // The previous batch could be incomplete due to the columns being too large to fit in + // a single RPC request or there could be consecutive empty batches which are not supposed + // to be there + + // The current (sub-optimal) strategy is to simply re-request all batches that could + // potentially be faulty. If a batch returns a different result than the original and + // results in successful processing, we downvote the original peer that sent us the batch. + + // this is our robust `processing_target`. 
All previous batches must be awaiting + // validation + let mut redownload_queue = Vec::new(); + + for (id, _) in self.batches.iter_mut().filter(|&(&id, _)| id > batch_id) { + redownload_queue.push(*id); + } + + // no batch maxed out it process attempts, so now the chain's volatile progress must be + // reset + self.processing_target = self.current_start; + + for id in redownload_queue { + self.send_batch(network, id)?; + } + // finally, re-request the failed batch. + self.send_batch(network, batch_id) + } + + /// Checks with the beacon chain if custody sync has completed. + fn check_completed(&mut self) -> bool { + if self.would_complete(self.current_start) { + // Check that the data column custody info `earliest_available_slot` + // is in an epoch that is less than or equal to the current DA boundary + let Some(earliest_data_column_epoch) = + self.beacon_chain.earliest_custodied_data_column_epoch() + else { + return false; + }; + + let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else { + return false; + }; + + return earliest_data_column_epoch <= column_da_boundary; + } + false + } + + /// Checks if custody backfill would complete by syncing to `start_epoch`. + fn would_complete(&self, start_epoch: Epoch) -> bool { + let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else { + return false; + }; + start_epoch <= column_da_boundary + } + + /// Requests the batch assigned to the given id from a given peer. 
+ fn send_batch( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + ) -> Result<(), CustodyBackfillError> { + let span = info_span!(SPAN_CUSTODY_BACKFILL_SYNC_BATCH_REQUEST); + let _enter = span.enter(); + + if let Some(batch) = self.batches.get_mut(&batch_id) { + let synced_peers = self + .network_globals + .peers + .read() + .synced_peers_for_epoch(batch_id) + .cloned() + .collect::>(); + + let request = batch.to_data_columns_by_range_request().map_err(|_| { + CustodyBackfillError::InvalidSyncState( + "Can't convert to data column by range request".to_string(), + ) + })?; + let failed_peers = batch.failed_peers(); + + match network.custody_backfill_data_columns_batch_request( + request, + CustodyBackfillBatchId { + epoch: batch_id, + run_id: self.run_id, + }, + &synced_peers, + &failed_peers, + ) { + Ok(request_id) => { + // inform the batch about the new request + if let Err(e) = batch.start_downloading(request_id.id) { + return self + .fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0)); + } + debug!(epoch = %batch_id, %batch, "Requesting batch"); + + return Ok(()); + } + Err(e) => match e { + crate::sync::network_context::RpcRequestSendError::NoPeer(no_peer) => { + // If we are here we have no more synced peers + debug!( + "reason" = format!("insufficient_synced_peers({no_peer:?})"), + "Custody sync paused" + ); + self.pause("Insufficient peers".to_string()); + return Err(CustodyBackfillError::Paused); + } + crate::sync::network_context::RpcRequestSendError::InternalError(e) => { + // NOTE: under normal conditions this shouldn't happen but we handle it anyway + warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); + // register the failed download and check if the batch can be retried + if let Err(e) = batch.start_downloading(1) { + return self + .fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0)); + } + + match batch.download_failed(None) { + Err(e) => 
self.fail_sync(CustodyBackfillError::BatchInvalidState( + batch_id, e.0, + ))?, + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { + self.fail_sync(CustodyBackfillError::BatchDownloadFailed(batch_id))? + } + Ok(BatchOperationOutcome::Continue) => { + return self.send_batch(network, batch_id); + } + } + } + }, + } + } + + Ok(()) + } + + /// The syncing process has failed. + /// + /// This resets past variables, to allow for a fresh start when resuming. + fn fail_sync(&mut self, error: CustodyBackfillError) -> Result<(), CustodyBackfillError> { + // Some errors shouldn't cause failure. + if matches!(error, CustodyBackfillError::Paused) { + return Ok(()); + } + + // Set the state + self.pause("Sync has failed".to_string()); + // Remove all batches and active requests. + self.batches.clear(); + self.restart_failed_sync = false; + + // Reset all downloading and processing targets + // NOTE: Lets keep validated_batches for posterity + self.processing_target = self.current_start; + self.to_be_downloaded = self.current_start; + self.last_batch_downloaded = false; + self.current_processing_batch = None; + self.restart_sync(); + + Err(error) + } + + pub fn state(&self) -> CustodyBackFillState { + self.network_globals.custody_sync_state.read().clone() + } + + /// Updates the global network state indicating the current state of a backfill sync. + pub fn set_state(&self, state: CustodyBackFillState) { + *self.network_globals.custody_sync_state.write() = state; + } + + /// A fully synced peer has joined us. + /// If we are in a failed state, update a local variable to indicate we are able to restart + /// the failed sync on the next attempt. 
+ pub fn fully_synced_peer_joined(&mut self) { + if matches!(self.state(), CustodyBackFillState::Pending(_)) { + self.restart_failed_sync = true; + } + } +} diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index d7ba028054..338f21ce98 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -46,7 +46,8 @@ use crate::status::ToStatusMessage; use crate::sync::block_lookups::{ BlobRequestState, BlockComponent, BlockRequestState, CustodyRequestState, DownloadResult, }; -use crate::sync::network_context::PeerGroup; +use crate::sync::custody_backfill_sync::CustodyBackFillSync; +use crate::sync::network_context::{PeerGroup, RpcResponseResult}; use beacon_chain::block_verification_types::AsBlock; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::{ @@ -56,14 +57,16 @@ use futures::StreamExt; use lighthouse_network::SyncInfo; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, CustodyRequester, - DataColumnsByRangeRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, - SingleLookupReqId, SyncRequestId, + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + CustodyBackFillBatchRequestId, CustodyBackfillBatchId, CustodyRequester, + DataColumnsByRangeRequestId, DataColumnsByRangeRequester, DataColumnsByRootRequestId, + DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; use lru_cache::LRUTimeCache; +use slot_clock::SlotClock; use std::ops::Sub; use std::sync::Arc; use std::time::Duration; @@ -158,6 +161,12 @@ pub enum SyncMessage { result: BatchProcessResult, }, + /// A custody batch has been processed by the processor thread. 
+ CustodyBatchProcessed { + batch_id: CustodyBackfillBatchId, + result: CustodyBatchProcessResult, + }, + /// Block processed BlockComponentProcessed { process_type: BlockProcessType, @@ -209,6 +218,19 @@ pub enum BatchProcessResult { NonFaultyFailure, } +/// The result of processing multiple data columns. +#[derive(Debug)] +pub enum CustodyBatchProcessResult { + /// The custody batch was completed successfully. It carries whether the sent batch contained data columns. + Success { + #[allow(dead_code)] + sent_columns: usize, + imported_columns: usize, + }, + /// The custody batch processing failed. + Error { peer_action: Option }, +} + /// The primary object for handling and driving all the current syncing logic. It maintains the /// current state of the syncing process, the number of useful peers, downloaded blocks and /// controls the logic behind both the long-range (batch) sync and the on-going potential parent @@ -229,6 +251,9 @@ pub struct SyncManager { /// Backfill syncing. backfill_sync: BackFillSync, + /// Custody syncing. + custody_backfill_sync: CustodyBackFillSync, + block_lookups: BlockLookups, /// debounce duplicated `UnknownBlockHashFromAttestation` for the same root peer tuple. A peer /// may forward us thousands of a attestations, each one triggering an individual event. Only @@ -288,7 +313,8 @@ impl SyncManager { fork_context.clone(), ), range_sync: RangeSync::new(beacon_chain.clone()), - backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals), + backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals.clone()), + custody_backfill_sync: CustodyBackFillSync::new(beacon_chain.clone(), network_globals), block_lookups: BlockLookups::new(), notified_unknown_roots: LRUTimeCache::new(Duration::from_secs( NOTIFIED_UNKNOWN_ROOT_EXPIRY_SECONDS, @@ -549,6 +575,7 @@ impl SyncManager { // inform the backfill sync that a new synced peer has joined us. 
if new_state.is_synced() { self.backfill_sync.fully_synced_peer_joined(); + self.custody_backfill_sync.fully_synced_peer_joined(); } } is_connected @@ -558,17 +585,18 @@ impl SyncManager { } } - /// Updates the global sync state, optionally instigating or pausing a backfill sync as well as + /// Updates the global sync state, optionally instigating or pausing a backfill or custody sync as well as /// logging any changes. /// /// The logic for which sync should be running is as follows: - /// - If there is a range-sync running (or required) pause any backfill and let range-sync + /// - If there is a range-sync running (or required) pause any backfill/custody sync and let range-sync /// complete. /// - If there is no current range sync, check for any requirement to backfill and either /// start/resume a backfill sync if required. The global state will be BackFillSync if a /// backfill sync is running. /// - If there is no range sync and no required backfill and we have synced up to the currently /// known peers, we consider ourselves synced. + /// - If there is no range sync and no required backfill we check if we need to execute a custody sync. fn update_sync_state(&mut self) { let new_state: SyncState = match self.range_sync.state() { Err(e) => { @@ -624,15 +652,51 @@ impl SyncManager { error!(error = ?e, "Backfill sync failed to start"); } } + + // If backfill is complete, check if we have a pending custody backfill to complete + let anchor_info = self.chain.store.get_anchor_info(); + if anchor_info.block_backfill_complete(self.chain.genesis_backfill_slot) { + match self.custody_backfill_sync.start(&mut self.network) { + Ok(SyncStart::Syncing { + completed, + remaining, + }) => { + sync_state = SyncState::CustodyBackFillSyncing { + completed, + remaining, + }; + } + Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if custody sync state didn't start. 
+ Err(e) => { + use crate::sync::custody_backfill_sync::CustodyBackfillError; + + match &e { + CustodyBackfillError::BatchDownloadFailed(_) + | CustodyBackfillError::BatchProcessingFailed(_) => { + debug!(error=?e, "Custody backfill batch processing or downloading failed"); + } + CustodyBackfillError::BatchInvalidState(_, reason) => { + error!(error=?e, reason, "Custody backfill sync failed due to invalid batch state") + } + CustodyBackfillError::InvalidSyncState(reason) => { + error!(error=?e, reason, "Custody backfill sync failed due to invalid sync state") + } + CustodyBackfillError::Paused => {} + } + } + } + } } // Return the sync state if backfilling is not required. sync_state } Some((RangeSyncType::Finalized, start_slot, target_slot)) => { - // If there is a backfill sync in progress pause it. + // Range sync is in progress. If there is a backfill or custody sync in progress pause it. #[cfg(not(feature = "disable-backfill"))] self.backfill_sync.pause(); + self.custody_backfill_sync + .pause("Range sync in progress".to_string()); SyncState::SyncingFinalized { start_slot, @@ -640,9 +704,12 @@ impl SyncManager { } } Some((RangeSyncType::Head, start_slot, target_slot)) => { - // If there is a backfill sync in progress pause it. + // Range sync is in progress. If there is a backfill or custody backfill sync + // in progress pause it. #[cfg(not(feature = "disable-backfill"))] self.backfill_sync.pause(); + self.custody_backfill_sync + .pause("Range sync in progress".to_string()); SyncState::SyncingHead { start_slot, @@ -662,7 +729,9 @@ impl SyncManager { if new_state.is_synced() && !matches!( old_state, - SyncState::Synced | SyncState::BackFillSyncing { .. } + SyncState::Synced + | SyncState::BackFillSyncing { .. } + | SyncState::CustodyBackFillSyncing { .. 
} ) { self.network.subscribe_core_topics(); @@ -693,6 +762,11 @@ impl SyncManager { let mut register_metrics_interval = tokio::time::interval(Duration::from_secs(5)); + // Trigger a sync state update every epoch. This helps check if we need to trigger a custody backfill sync. + let epoch_duration = + self.chain.slot_clock.slot_duration().as_secs() * T::EthSpec::slots_per_epoch(); + let mut epoch_interval = tokio::time::interval(Duration::from_secs(epoch_duration)); + // process any inbound messages loop { tokio::select! { @@ -711,6 +785,9 @@ impl SyncManager { _ = register_metrics_interval.tick() => { self.network.register_metrics(); } + _ = epoch_interval.tick() => { + self.update_sync_state(); + } } } } @@ -865,6 +942,21 @@ impl SyncManager { } } }, + SyncMessage::CustodyBatchProcessed { result, batch_id } => { + match self.custody_backfill_sync.on_batch_process_result( + &mut self.network, + batch_id, + &result, + ) { + Ok(ProcessResult::Successful) => {} + Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), + Err(error) => { + error!(error = ?error, "Custody sync failed"); + // Update the global status + self.update_sync_state(); + } + } + } } } @@ -1081,11 +1173,13 @@ impl SyncManager { RpcEvent::from_chunk(data_column, seen_timestamp), ); } - SyncRequestId::DataColumnsByRange(id) => self.on_data_columns_by_range_response( - id, - peer_id, - RpcEvent::from_chunk(data_column, seen_timestamp), - ), + SyncRequestId::DataColumnsByRange(req_id) => { + self.on_data_columns_by_range_response( + req_id, + peer_id, + RpcEvent::from_chunk(data_column, seen_timestamp), + ); + } _ => { crit!(%peer_id, "bad request id for data_column"); } @@ -1173,11 +1267,22 @@ impl SyncManager { .network .on_data_columns_by_range_response(id, peer_id, data_column) { - self.on_range_components_response( - id.parent_request_id, - peer_id, - RangeBlockComponent::CustodyColumns(id, resp), - ); + match id.parent_request_id { + 
DataColumnsByRangeRequester::ComponentsByRange(components_by_range_req_id) => { + self.on_range_components_response( + components_by_range_req_id, + peer_id, + RangeBlockComponent::CustodyColumns(id, resp), + ); + } + DataColumnsByRangeRequester::CustodyBackfillSync(custody_backfill_req_id) => self + .on_custody_backfill_columns_response( + custody_backfill_req_id, + id, + peer_id, + resp, + ), + } } } @@ -1267,6 +1372,36 @@ impl SyncManager { } } } + + /// Handles receiving a response for a custody range sync request that has columns. + fn on_custody_backfill_columns_response( + &mut self, + custody_sync_request_id: CustodyBackFillBatchRequestId, + req_id: DataColumnsByRangeRequestId, + peer_id: PeerId, + data_columns: RpcResponseResult>>>, + ) { + if let Some(resp) = self.network.custody_backfill_data_columns_response( + custody_sync_request_id, + req_id, + data_columns, + ) { + match self.custody_backfill_sync.on_data_column_response( + &mut self.network, + custody_sync_request_id, + &peer_id, + resp, + ) { + Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), + Ok(ProcessResult::Successful) => {} + Err(_e) => { + // The custody sync has failed, errors are reported + // within. + self.update_sync_state(); + } + } + } + } } impl From> for BlockProcessingResult { diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 4dab2e17d3..054bab654c 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -2,14 +2,17 @@ //! //! Stores the various syncing methods for the beacon chain. 
mod backfill_sync; +mod batch; mod block_lookups; mod block_sidecar_coupling; +mod custody_backfill_sync; pub mod manager; mod network_context; mod peer_sync_info; +mod range_data_column_batch_request; mod range_sync; #[cfg(test)] mod tests; pub use manager::{BatchProcessResult, SyncMessage}; -pub use range_sync::{BatchOperationOutcome, ChainId}; +pub use range_sync::ChainId; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index ac2991c147..2e0c56db23 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -6,16 +6,17 @@ pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlock use super::SyncMessage; use super::block_sidecar_coupling::RangeBlockComponentsRequest; use super::manager::BlockProcessType; -use super::range_sync::ByRangeRequestType; use crate::metrics; use crate::network_beacon_processor::NetworkBeaconProcessor; #[cfg(test)] use crate::network_beacon_processor::TestBeaconChainType; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; +use crate::sync::batch::ByRangeRequestType; use crate::sync::block_lookups::SingleLookupId; use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; +use crate::sync::range_data_column_batch_request::RangeDataColumnBatchRequest; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; use custody::CustodyRequestResult; @@ -25,7 +26,8 @@ use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, Req pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - CustodyId, CustodyRequester, DataColumnsByRangeRequestId, DataColumnsByRootRequestId, + 
CustodyBackFillBatchRequestId, CustodyBackfillBatchId, CustodyId, CustodyRequester, + DataColumnsByRangeRequestId, DataColumnsByRangeRequester, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; @@ -211,7 +213,6 @@ pub struct SyncNetworkContext { /// A mapping of active DataColumnsByRange requests data_columns_by_range_requests: ActiveRequests>, - /// Mapping of active custody column requests for a block root custody_by_root_requests: FnvHashMap>, @@ -219,6 +220,10 @@ pub struct SyncNetworkContext { components_by_range_requests: FnvHashMap>, + /// A batch of data columns by range request for custody sync + custody_backfill_data_column_batch_requests: + FnvHashMap>, + /// Whether the ee is online. If it's not, we don't allow access to the /// `beacon_processor_send`. execution_engine_state: EngineState, @@ -295,6 +300,7 @@ impl SyncNetworkContext { data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), custody_by_root_requests: <_>::default(), components_by_range_requests: FnvHashMap::default(), + custody_backfill_data_column_batch_requests: FnvHashMap::default(), network_beacon_processor, chain, fork_context, @@ -324,6 +330,7 @@ impl SyncNetworkContext { custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests components_by_range_requests: _, + custody_backfill_data_column_batch_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -354,7 +361,6 @@ impl SyncNetworkContext { .active_requests_of_peer(peer_id) .into_iter() .map(|req_id| SyncRequestId::DataColumnsByRange(*req_id)); - blocks_by_root_ids .chain(blobs_by_root_ids) .chain(data_column_by_root_ids) @@ -421,6 +427,7 @@ impl SyncNetworkContext { custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests components_by_range_requests: 
_, + custody_backfill_data_column_batch_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -503,7 +510,7 @@ impl SyncNetworkContext { count: *request.count(), columns, }, - id, + DataColumnsByRangeRequester::ComponentsByRange(id), new_range_request_span!( self, "outgoing_columns_by_range_retry", @@ -533,19 +540,21 @@ impl SyncNetworkContext { batch_type: ByRangeRequestType, request: BlocksByRangeRequest, requester: RangeRequestId, - peers: &HashSet, + block_peers: &HashSet, + column_peers: &HashSet, peers_to_deprioritize: &HashSet, ) -> Result { let range_request_span = debug_span!( parent: None, SPAN_OUTGOING_RANGE_REQUEST, range_req_id = %requester, - peers = peers.len() + block_peers = block_peers.len(), + column_peers = column_peers.len() ); let _guard = range_request_span.clone().entered(); let active_request_count_by_peer = self.active_request_count_by_peer(); - let Some(block_peer) = peers + let Some(block_peer) = block_peers .iter() .map(|peer| { ( @@ -579,7 +588,7 @@ impl SyncNetworkContext { .collect(); Some(self.select_columns_by_range_peers_to_request( &column_indexes, - peers, + column_peers, active_request_count_by_peer, peers_to_deprioritize, )?) 
@@ -636,7 +645,7 @@ impl SyncNetworkContext { count: *request.count(), columns, }, - id, + DataColumnsByRangeRequester::ComponentsByRange(id), new_range_request_span!( self, "outgoing_columns_by_range", @@ -770,7 +779,6 @@ impl SyncNetworkContext { let range_req = entry.get_mut(); if let Some(blocks_result) = range_req.responses(&self.chain.spec) { if let Err(CouplingError::DataColumnPeerFailure { - action: _, error, faulty_peers: _, exceeded_retries, @@ -1237,7 +1245,7 @@ impl SyncNetworkContext { &mut self, peer_id: PeerId, request: DataColumnsByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, + parent_request_id: DataColumnsByRangeRequester, request_span: Span, ) -> Result<(DataColumnsByRangeRequestId, Vec), RpcRequestSendError> { let requested_columns = request.columns.clone(); @@ -1678,6 +1686,111 @@ impl SyncNetworkContext { }) } + /// data column by range requests sent by the custody sync algorithm + pub fn custody_backfill_data_columns_batch_request( + &mut self, + request: DataColumnsByRangeRequest, + batch_id: CustodyBackfillBatchId, + peers: &HashSet, + peers_to_deprioritize: &HashSet, + ) -> Result { + let active_request_count_by_peer = self.active_request_count_by_peer(); + // Attempt to find all required custody peers before sending any request or creating an ID + let columns_by_range_peers_to_request = { + let column_indexes = self + .chain + .sampling_columns_for_epoch(batch_id.epoch) + .iter() + .cloned() + .collect(); + + self.select_columns_by_range_peers_to_request( + &column_indexes, + peers, + active_request_count_by_peer, + peers_to_deprioritize, + )? 
+ }; + + // Create the overall `custody_by_range` request id + let id = CustodyBackFillBatchRequestId { + id: self.next_id(), + batch_id, + }; + + let result = columns_by_range_peers_to_request + .iter() + .filter_map(|(peer_id, _)| { + self.send_data_columns_by_range_request( + *peer_id, + request.clone(), + DataColumnsByRangeRequester::CustodyBackfillSync(id), + Span::none(), + ) + .ok() + }) + .collect::>(); + + let range_data_column_batch_request = + RangeDataColumnBatchRequest::new(result, self.chain.clone(), batch_id.epoch); + + self.custody_backfill_data_column_batch_requests + .insert(id, range_data_column_batch_request); + + Ok(id) + } + + /// Received a data columns by range response from a custody sync request which batches them. + pub fn custody_backfill_data_columns_response( + &mut self, + // Identifies the custody backfill request for all data columns on this epoch + custody_sync_request_id: CustodyBackFillBatchRequestId, + // Identifies a specific data_columns_by_range request for *some* columns in this epoch. We + // pass them separately as DataColumnsByRangeRequestId parent is an enum and would require + // matching again. 
+ req_id: DataColumnsByRangeRequestId, + data_columns: RpcResponseResult>, + ) -> Option, RpcResponseError>> { + let Entry::Occupied(mut entry) = self + .custody_backfill_data_column_batch_requests + .entry(custody_sync_request_id) + else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["range_data_columns"], + ); + return None; + }; + + if let Err(e) = { + let request = entry.get_mut(); + data_columns.and_then(|(data_columns, _)| { + request + .add_custody_columns(req_id, data_columns.clone()) + .map_err(|e| { + RpcResponseError::BlockComponentCouplingError(CouplingError::InternalError( + e, + )) + }) + }) + } { + entry.remove(); + return Some(Err(e)); + } + + if let Some(data_column_result) = entry.get_mut().responses() { + if data_column_result.is_ok() { + // remove the entry only if it coupled successfully with + // no errors + entry.remove(); + } + // If the request is finished, dequeue everything + Some(data_column_result.map_err(RpcResponseError::BlockComponentCouplingError)) + } else { + None + } + } + pub(crate) fn register_metrics(&self) { for (id, count) in [ ("blocks_by_root", self.blocks_by_root_requests.len()), diff --git a/beacon_node/network/src/sync/range_data_column_batch_request.rs b/beacon_node/network/src/sync/range_data_column_batch_request.rs new file mode 100644 index 0000000000..542d99d97c --- /dev/null +++ b/beacon_node/network/src/sync/range_data_column_batch_request.rs @@ -0,0 +1,297 @@ +use std::collections::{HashMap, HashSet}; + +use crate::sync::block_sidecar_coupling::{ByRangeRequest, CouplingError}; +use crate::sync::network_context::MAX_COLUMN_RETRIES; +use beacon_chain::{BeaconChain, BeaconChainTypes}; +use itertools::Itertools; +use lighthouse_network::PeerId; +use lighthouse_network::service::api_types::DataColumnsByRangeRequestId; +use std::sync::Arc; +use types::{ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, Slot}; + +pub struct RangeDataColumnBatchRequest { + requests: 
HashMap< + DataColumnsByRangeRequestId, + ByRangeRequest>, + >, + /// The column indices corresponding to the request + column_peers: HashMap>, + expected_custody_columns: HashSet, + attempt: usize, + beacon_chain: Arc>, + epoch: Epoch, +} + +impl RangeDataColumnBatchRequest { + pub fn new( + by_range_requests: Vec<(DataColumnsByRangeRequestId, Vec)>, + beacon_chain: Arc>, + epoch: Epoch, + ) -> Self { + let requests = by_range_requests + .clone() + .into_iter() + .map(|(req, _)| (req, ByRangeRequest::Active(req))) + .collect::>(); + + let column_peers = by_range_requests.clone().into_iter().collect(); + + let expected_custody_columns = by_range_requests + .into_iter() + .flat_map(|(_, column_indices)| column_indices) + .collect(); + + Self { + requests, + column_peers, + expected_custody_columns, + beacon_chain, + epoch, + attempt: 0, + } + } + + pub fn add_custody_columns( + &mut self, + req_id: DataColumnsByRangeRequestId, + columns: Vec>>, + ) -> Result<(), String> { + let req = self + .requests + .get_mut(&req_id) + .ok_or(format!("unknown data columns by range req_id {req_id}"))?; + req.finish(req_id, columns) + } + + pub fn responses( + &mut self, + ) -> Option, CouplingError>> { + let mut received_columns_for_slot: HashMap> = + HashMap::new(); + let mut column_to_peer_id: HashMap = HashMap::new(); + + for column in self + .requests + .values() + .filter_map(|req| req.to_finished()) + .flatten() + { + received_columns_for_slot + .entry(column.slot()) + .or_default() + .push(column.clone()); + } + + // Note: this assumes that only 1 peer is responsible for a column + // with a batch. + for (id, columns) in self.column_peers.iter() { + for column in columns { + column_to_peer_id.insert(*column, id.peer); + } + } + + // An "attempt" is complete here after we have received a response for all the + // requests we made. i.e. `req.to_finished()` returns Some for all requests. 
+ self.attempt += 1; + + let resp = self.responses_with_custody_columns( + received_columns_for_slot, + column_to_peer_id, + &self.expected_custody_columns, + self.attempt, + ); + + if let Err(CouplingError::DataColumnPeerFailure { + error: _, + faulty_peers, + exceeded_retries: _, + }) = &resp + { + for (_, peer) in faulty_peers.iter() { + // find the req id associated with the peer and + // delete it from the entries as we are going to make + // a separate attempt for those components. + self.requests.retain(|&k, _| k.peer != *peer); + } + } + Some(resp) + } + + fn responses_with_custody_columns( + &self, + mut received_columns_for_slot: HashMap>, + column_to_peer: HashMap, + expected_custody_columns: &HashSet, + attempt: usize, + ) -> Result, CouplingError> { + let mut naughty_peers = vec![]; + let mut result: DataColumnSidecarList = vec![]; + + let forward_blocks_iter = self + .beacon_chain + .forwards_iter_block_roots_until( + self.epoch.start_slot(T::EthSpec::slots_per_epoch()), + self.epoch.end_slot(T::EthSpec::slots_per_epoch()), + ) + .map_err(|_| { + CouplingError::InternalError("Failed to fetch block root iterator".to_string()) + })?; + + for block_iter_result in forward_blocks_iter { + let (block_root, slot) = block_iter_result.map_err(|_| { + CouplingError::InternalError("Failed to iterate block roots".to_string()) + })?; + + let Some(block) = self + .beacon_chain + .get_blinded_block(&block_root) + .ok() + .flatten() + else { + // The block root we are fetching is from the forwards block root iterator. This doesn't seem like a possible scenario. + return Err(CouplingError::InternalError( + "Block root from forwards block iterator not found in db".to_string(), + )); + }; + + let Some(columns) = received_columns_for_slot.remove(&slot) else { + // If at least one blob is expected for this slot but none have been served, penalize all peers + // The slot check ensures we arent checking a skipped slot. 
+ if block.num_expected_blobs() != 0 && block.slot() == slot { + for column in expected_custody_columns { + if let Some(naughty_peer) = column_to_peer.get(column) { + naughty_peers.push((*column, *naughty_peer)); + } + } + } + continue; + }; + + // This is a skipped slot, skip to the next slot after we verify that peers + // didn't serve us columns for a skipped slot + if block.slot() != slot { + // If we received columns for a skipped slot, punish the peer + if !columns.is_empty() { + for column in expected_custody_columns { + if let Some(naughty_peer) = column_to_peer.get(column) { + naughty_peers.push((*column, *naughty_peer)); + } + } + } + + continue; + } + + let column_block_roots = columns + .iter() + .map(|column| column.block_root()) + .unique() + .collect::>(); + + let column_block_signatures = columns + .iter() + .map(|column| column.signed_block_header.signature.clone()) + .unique() + .collect::>(); + + let column_block_root = match column_block_roots.as_slice() { + // We expect a single unique block root + [column_block_root] => *column_block_root, + // If there are no block roots, penalize all peers + [] => { + for column in &columns { + if let Some(naughty_peer) = column_to_peer.get(&column.index) { + naughty_peers.push((column.index, *naughty_peer)); + } + } + continue; + } + // If theres more than one unique block root penalize the peers serving the bad block roots. 
+ column_block_roots => { + for column in columns { + if column_block_roots.contains(&column.block_root()) + && block_root != column.block_root() + && let Some(naughty_peer) = column_to_peer.get(&column.index) + { + naughty_peers.push((column.index, *naughty_peer)); + } + } + continue; + } + }; + + let column_block_signature = match column_block_signatures.as_slice() { + // We expect a single unique block signature + [block_signature] => block_signature, + // If there are no block signatures, penalize all peers + [] => { + for column in &columns { + if let Some(naughty_peer) = column_to_peer.get(&column.index) { + naughty_peers.push((column.index, *naughty_peer)); + } + } + continue; + } + // If theres more than one unique block signature, penalize the peers serving the + // invalid block signatures. + column_block_signatures => { + for column in columns { + if column_block_signatures.contains(&column.signed_block_header.signature) + && block.signature() != &column.signed_block_header.signature + && let Some(naughty_peer) = column_to_peer.get(&column.index) + { + naughty_peers.push((column.index, *naughty_peer)); + } + } + continue; + } + }; + + // if the block root doesn't match the columns block root, penalize the peers + if block_root != column_block_root { + for column in &columns { + if let Some(naughty_peer) = column_to_peer.get(&column.index) { + naughty_peers.push((column.index, *naughty_peer)); + } + } + } + + // If the block signature doesn't match the columns block signature, penalize the peers + if block.signature() != column_block_signature { + for column in &columns { + if let Some(naughty_peer) = column_to_peer.get(&column.index) { + naughty_peers.push((column.index, *naughty_peer)); + } + } + } + + let received_columns = columns.iter().map(|c| c.index).collect::>(); + + let missing_columns = received_columns + .difference(expected_custody_columns) + .collect::>(); + + // blobs are expected for this slot but there is at least one missing columns + // 
penalize the peers responsible for those columns. + if block.num_expected_blobs() != 0 && !missing_columns.is_empty() { + for column in missing_columns { + if let Some(naughty_peer) = column_to_peer.get(column) { + naughty_peers.push((*column, *naughty_peer)); + }; + } + } + + result.extend(columns); + } + + if !naughty_peers.is_empty() { + return Err(CouplingError::DataColumnPeerFailure { + error: "Bad or missing columns for some slots".to_string(), + faulty_peers: naughty_peers, + exceeded_retries: attempt >= MAX_COLUMN_RETRIES, + }); + } + + Ok(result) + } +} diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 3b816c0922..014d728ffe 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1,10 +1,13 @@ use super::RangeSyncType; -use super::batch::{BatchInfo, BatchProcessingResult, BatchState}; use crate::metrics; use crate::network_beacon_processor::ChainSegmentProcessId; +use crate::sync::batch::BatchId; +use crate::sync::batch::{ + BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, +}; use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcResponseError}; -use crate::sync::{BatchOperationOutcome, BatchProcessResult, network_context::SyncNetworkContext}; +use crate::sync::{BatchProcessResult, network_context::SyncNetworkContext}; use beacon_chain::BeaconChainTypes; use beacon_chain::block_verification_types::RpcBlock; use lighthouse_network::service::api_types::Id; @@ -12,6 +15,8 @@ use lighthouse_network::{PeerAction, PeerId}; use lighthouse_tracing::SPAN_SYNCING_CHAIN; use logging::crit; use std::collections::{BTreeMap, HashSet, btree_map::Entry}; +use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; use strum::IntoStaticStr; use tracing::{Span, debug, instrument, warn}; use types::{ColumnIndex, Epoch, EthSpec, 
Hash256, Slot}; @@ -35,6 +40,35 @@ const BATCH_BUFFER_SIZE: u8 = 5; /// and continued is now in an inconsistent state. pub type ProcessingResult = Result; +type RpcBlocks = Vec>; +type RangeSyncBatchInfo = BatchInfo, RpcBlocks>; +type RangeSyncBatches = BTreeMap>; + +/// The number of times to retry a batch before it is considered failed. +const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; + +/// Invalid batches are attempted to be re-downloaded from other peers. If a batch cannot be processed +/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. +const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; + +pub struct RangeSyncBatchConfig { + marker: PhantomData, +} + +impl BatchConfig for RangeSyncBatchConfig { + fn max_batch_download_attempts() -> u8 { + MAX_BATCH_DOWNLOAD_ATTEMPTS + } + fn max_batch_processing_attempts() -> u8 { + MAX_BATCH_PROCESSING_ATTEMPTS + } + fn batch_attempt_hash(data: &D) -> u64 { + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + data.hash(&mut hasher); + hasher.finish() + } +} + /// Reasons for removing a chain #[derive(Debug)] #[allow(dead_code)] @@ -55,7 +89,6 @@ pub struct KeepChain; /// A chain identifier pub type ChainId = Id; -pub type BatchId = Epoch; #[derive(Debug, Copy, Clone, IntoStaticStr)] pub enum SyncingChainType { @@ -85,7 +118,7 @@ pub struct SyncingChain { pub target_head_root: Hash256, /// Sorted map of batches undergoing some kind of processing. - batches: BTreeMap>, + batches: RangeSyncBatches, /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain /// and thus available to download this chain from, as well as the batches we are currently @@ -249,7 +282,7 @@ impl SyncingChain { // request_id matches // TODO(das): removed peer_id matching as the node may request a different peer for data // columns. 
- if !batch.is_expecting_block(&request_id) { + if !batch.is_expecting_request_id(&request_id) { return Ok(KeepChain); } batch @@ -260,7 +293,8 @@ impl SyncingChain { // Remove the request from the peer's active batches // TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258 - let received = batch.download_completed(blocks, *peer_id)?; + let received = blocks.len(); + batch.download_completed(blocks, *peer_id)?; let awaiting_batches = batch_id .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) / EPOCHS_PER_BATCH; @@ -871,7 +905,6 @@ impl SyncingChain { CouplingError::DataColumnPeerFailure { error, faulty_peers, - action, exceeded_retries, } => { debug!(?batch_id, error, "Block components coupling error"); @@ -883,12 +916,22 @@ impl SyncingChain { failed_columns.insert(*column); failed_peers.insert(*peer); } - for peer in failed_peers.iter() { - network.report_peer(*peer, *action, "failed to return columns"); - } // Retry the failed columns if the column requests haven't exceeded the // max retries. Otherwise, remove treat it as a failed batch below. if !*exceeded_retries { + // Set the batch back to `AwaitingDownload` before retrying. + // This is to ensure that the batch doesn't get stuck in `Downloading` state. + // + // DataColumn retries has a retry limit so calling `downloading_to_awaiting_download` + // is safe. + if let BatchOperationOutcome::Failed { blacklist } = + batch.downloading_to_awaiting_download()? + { + return Err(RemoveChain::ChainFailed { + blacklist, + failing_batch: batch_id, + }); + } return self.retry_partial_batch( network, batch_id, @@ -909,7 +952,7 @@ impl SyncingChain { // A batch could be retried without the peer failing the request (disconnecting/ // sending an error /timeout) if the peer is removed from the chain for other // reasons. 
Check that this block belongs to the expected peer - if !batch.is_expecting_block(&request_id) { + if !batch.is_expecting_request_id(&request_id) { debug!( batch_epoch = %batch_id, batch_state = ?batch.state(), @@ -936,7 +979,10 @@ impl SyncingChain { failing_batch: batch_id, }); } - self.send_batch(network, batch_id) + // The errored batch is set to AwaitingDownload above. + // We now just attempt to download all batches stuck in `AwaitingDownload` + // state in the right order. + self.attempt_send_awaiting_download_batches(network, "injecting error") } else { debug!( batch_epoch = %batch_id, @@ -969,7 +1015,7 @@ impl SyncingChain { .collect(); debug!( ?awaiting_downloads, - src, "Attempting to send batches awaiting downlaod" + src, "Attempting to send batches awaiting download" ); for batch_id in awaiting_downloads { @@ -998,11 +1044,11 @@ impl SyncingChain { let (request, batch_type) = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_peers(); - let synced_peers = network + let synced_column_peers = network .network_globals() .peers .read() - .synced_peers_for_epoch(batch_id, Some(&self.peers)) + .synced_peers_for_epoch(batch_id) .cloned() .collect::>(); @@ -1013,7 +1059,13 @@ impl SyncingChain { chain_id: self.id, batch_id, }, - &synced_peers, + // Request blocks only from peers of this specific chain + &self.peers, + // Request column from all synced peers, even if they are not part of this chain. + // This is to avoid splitting of good column peers across many head chains in a heavy forking + // environment. If the column peers and block peer are on different chains, then we return + // a coupling error and retry only the columns that failed to couple. See `Self::retry_partial_batch`. 
+ &synced_column_peers, &failed_peers, ) { Ok(request_id) => { @@ -1081,7 +1133,7 @@ impl SyncingChain { .network_globals() .peers .read() - .synced_peers_for_epoch(batch_id, Some(&self.peers)) + .synced_peers_for_epoch(batch_id) .cloned() .collect::>(); @@ -1093,6 +1145,8 @@ impl SyncingChain { &failed_columns, ) { Ok(_) => { + // inform the batch about the new request + batch.start_downloading(id)?; debug!( ?batch_id, id, "Retried column requests from different peers" @@ -1100,6 +1154,8 @@ impl SyncingChain { return Ok(KeepChain); } Err(e) => { + // No need to explicitly fail the batch since its in `AwaitingDownload` state + // before we attempted to retry. debug!(?batch_id, id, e, "Failed to retry partial batch"); } } @@ -1123,6 +1179,9 @@ impl SyncingChain { ) -> Result { let _guard = self.span.clone().entered(); debug!("Resuming chain"); + // attempt to download any batches stuck in the `AwaitingDownload` state because of + // a lack of peers before. + self.attempt_send_awaiting_download_batches(network, "resume")?; // Request more batches if needed. self.request_batches(network)?; // If there is any batch ready for processing, send it. @@ -1208,7 +1267,7 @@ impl SyncingChain { // only request batches up to the buffer size limit // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync // if the current processing window is contained in a long range of skip slots. - let in_buffer = |batch: &BatchInfo| { + let in_buffer = |batch: &RangeSyncBatchInfo| { matches!( batch.state(), BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) 
@@ -1295,7 +1354,7 @@ impl SyncingChain { } } -use super::batch::WrongState as WrongBatchState; +use crate::sync::batch::WrongState as WrongBatchState; impl From for RemoveChain { fn from(err: WrongBatchState) -> Self { RemoveChain::WrongBatchState(err.0) diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 8f881fba90..dd9f17bfd1 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -1,17 +1,11 @@ //! This provides the logic for syncing a chain when the local node is far behind it's current //! peers. - -mod batch; mod chain; mod chain_collection; mod range; mod sync_type; -pub use batch::{ - BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, - ByRangeRequestType, -}; -pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; +pub use chain::{ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] pub use chain_collection::SyncChainStatus; pub use range::RangeSync; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 465edd3697..c9656ad1d0 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -39,12 +39,13 @@ //! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially //! and further batches are requested as current blocks are being processed. 
-use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; +use super::chain::{ChainId, RemoveChain, SyncingChain}; use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; use crate::metrics; use crate::status::ToStatusMessage; use crate::sync::BatchProcessResult; +use crate::sync::batch::BatchId; use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 2e3b3fde4b..e4c7c6ff1f 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -60,22 +60,34 @@ pub fn cli_app() -> Command { .display_order(0) ) .arg( - // TODO(das): remove this before PeerDAS release + Arg::new("semi-supernode") + .long("semi-supernode") + .action(ArgAction::SetTrue) + .help_heading(FLAG_HEADER) + .conflicts_with("supernode") + .help("Run in minimal reconstruction mode. This node will subscribe to and custody \ + half of the data columns (enough for reconstruction), enabling efficient \ + data availability with lower bandwidth and storage requirements compared to \ + a supernode, while still supporting full blob reconstruction.") + .display_order(0) + ) + .arg( Arg::new("malicious-withhold-count") .long("malicious-withhold-count") .action(ArgAction::Set) .help_heading(FLAG_HEADER) - .help("TESTING ONLY do not use this") + .help("TESTING ONLY: Withholds a subset of data columns during publishing. \ + Do not use in production. Requires the 'testing' feature to be enabled.") .hide(true) .display_order(0) ) .arg( - // TODO(das): remove this before PeerDAS release Arg::new("advertise-false-custody-group-count") .long("advertise-false-custody-group-count") .action(ArgAction::Set) .help_heading(FLAG_HEADER) - .help("Advertises a false CGC for testing PeerDAS. 
Do NOT use in production.") + .help("TESTING ONLY: Advertises a false custody group count for testing PeerDAS. \ + Do not use in production. Requires the 'testing' feature to be enabled.") .hide(true) .display_order(0) ) @@ -1594,9 +1606,9 @@ pub fn cli_app() -> Command { .value_name("SECONDS") .action(ArgAction::Set) .help_heading(FLAG_HEADER) - .help("TESTING ONLY: Artificially delay block publishing by the specified number of seconds. \ - This only works for if `BroadcastValidation::Gossip` is used (default). \ - DO NOT USE IN PRODUCTION.") + .help("TESTING ONLY: Artificially delays block publishing by the specified number of seconds. \ + This only works if BroadcastValidation::Gossip is used (default). \ + Do not use in production. Requires the 'testing' feature to be enabled.") .hide(true) .display_order(0) ) @@ -1606,10 +1618,10 @@ pub fn cli_app() -> Command { .value_name("SECONDS") .action(ArgAction::Set) .help_heading(FLAG_HEADER) - .help("TESTING ONLY: Artificially delay data column publishing by the specified number of seconds. \ - Limitation: If `delay-block-publishing` is also used, data columns will be delayed for a \ - minimum of `delay-block-publishing` seconds. - DO NOT USE IN PRODUCTION.") + .help("TESTING ONLY: Artificially delays data column publishing by the specified number of seconds. \ + Limitation: If delay-block-publishing is also used, data columns will be delayed for a \ + minimum of delay-block-publishing seconds. \ + Do not use in production. 
Requires the 'testing' feature to be enabled.") .hide(true) .display_order(0) ) diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index c2599ec0cd..3b0e80e0b7 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -4,10 +4,11 @@ use beacon_chain::chain_config::{ DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION, DEFAULT_RE_ORG_PARENT_THRESHOLD, DisallowedReOrgOffsets, INVALID_HOLESKY_BLOCK_ROOT, ReOrgThreshold, }; +use beacon_chain::custody_context::NodeCustodyType; use beacon_chain::graffiti_calculator::GraffitiOrigin; use clap::{ArgMatches, Id, parser::ValueSource}; use clap_utils::flags::DISABLE_MALLOC_TUNING_FLAG; -use clap_utils::{parse_flag, parse_optional, parse_required}; +use clap_utils::{parse_flag, parse_required}; use client::{ClientConfig, ClientGenesis}; use directory::{DEFAULT_BEACON_NODE_DIR, DEFAULT_NETWORK_DIR, DEFAULT_ROOT_DIR}; use environment::RuntimeContext; @@ -108,6 +109,19 @@ pub fn get_config( set_network_config(&mut client_config.network, cli_args, &data_dir_ref)?; + // Parse custody mode from CLI flags + let is_supernode = parse_flag(cli_args, "supernode"); + let is_semi_supernode = parse_flag(cli_args, "semi-supernode"); + + client_config.chain.node_custody_type = if is_supernode { + client_config.network.subscribe_all_data_column_subnets = true; + NodeCustodyType::Supernode + } else if is_semi_supernode { + NodeCustodyType::SemiSupernode + } else { + NodeCustodyType::Fullnode + }; + /* * Staking flag * Note: the config values set here can be overwritten by other more specific cli params @@ -421,6 +435,7 @@ pub fn get_config( client_config.store.blob_prune_margin_epochs = blob_prune_margin_epochs; } + #[cfg(feature = "testing")] if let Some(malicious_withhold_count) = clap_utils::parse_optional(cli_args, "malicious-withhold-count")? 
{ @@ -835,10 +850,12 @@ pub fn get_config( .max_gossip_aggregate_batch_size = clap_utils::parse_required(cli_args, "beacon-processor-aggregate-batch-size")?; + #[cfg(feature = "testing")] if let Some(delay) = clap_utils::parse_optional(cli_args, "delay-block-publishing")? { client_config.chain.block_publishing_delay = Some(Duration::from_secs_f64(delay)); } + #[cfg(feature = "testing")] if let Some(delay) = clap_utils::parse_optional(cli_args, "delay-data-column-publishing")? { client_config.chain.data_column_publishing_delay = Some(Duration::from_secs_f64(delay)); } @@ -1133,10 +1150,6 @@ pub fn set_network_config( config.network_dir = data_dir.join(DEFAULT_NETWORK_DIR); }; - if parse_flag(cli_args, "supernode") { - config.subscribe_all_data_column_subnets = true; - } - if parse_flag(cli_args, "subscribe-all-subnets") { config.subscribe_all_subnets = true; } @@ -1145,8 +1158,9 @@ pub fn set_network_config( config.import_all_attestations = true; } + #[cfg(feature = "testing")] if let Some(advertise_false_custody_group_count) = - parse_optional(cli_args, "advertise-false-custody-group-count")? + clap_utils::parse_optional(cli_args, "advertise-false-custody-group-count")? 
{ config.advertise_false_custody_group_count = Some(advertise_false_custody_group_count); } diff --git a/beacon_node/store/src/database/leveldb_impl.rs b/beacon_node/store/src/database/leveldb_impl.rs index 385f35a33d..8fdd5812ea 100644 --- a/beacon_node/store/src/database/leveldb_impl.rs +++ b/beacon_node/store/src/database/leveldb_impl.rs @@ -282,7 +282,8 @@ impl LevelDB { ) -> Result<(), Error> { let mut leveldb_batch = Writebatch::new(); let iter = self.db.iter(self.read_options()); - + let start_key = BytesKey::from_vec(column.as_bytes().to_vec()); + iter.seek(&start_key); iter.take_while(move |(key, _)| key.matches_column(column)) .for_each(|(key, value)| { if f(&value).unwrap_or(false) { diff --git a/beacon_node/store/src/hot_cold_store.rs b/beacon_node/store/src/hot_cold_store.rs index 0d8a65e064..895afa4f33 100644 --- a/beacon_node/store/src/hot_cold_store.rs +++ b/beacon_node/store/src/hot_cold_store.rs @@ -949,6 +949,19 @@ impl, Cold: ItemStore> HotColdDB )); } + pub fn data_column_as_kv_store_ops( + &self, + block_root: &Hash256, + data_column: Arc>, + ops: &mut Vec, + ) { + ops.push(KeyValueStoreOp::PutKeyValue( + DBColumn::BeaconDataColumn, + get_data_column_key(block_root, &data_column.index), + data_column.as_ssz_bytes(), + )); + } + pub fn put_data_column_custody_info( &self, earliest_data_column_slot: Option, @@ -3178,13 +3191,14 @@ impl, Cold: ItemStore> HotColdDB self.try_prune_blobs(force, min_data_availability_boundary) } - /// Try to prune blobs older than the data availability boundary. + /// Try to prune blobs and data columns older than the data availability boundary. /// /// Blobs from the epoch `data_availability_boundary - blob_prune_margin_epochs` are retained. /// This epoch is an _exclusive_ endpoint for the pruning process. /// - /// This function only supports pruning blobs older than the split point, which is older than - /// (or equal to) finalization. Pruning blobs newer than finalization is not supported. 
+ /// This function only supports pruning blobs and data columns older than the split point, + /// which is older than (or equal to) finalization. Pruning blobs and data columns newer than + /// finalization is not supported. /// /// This function also assumes that the split is stationary while it runs. It should only be /// run from the migrator thread (where `migrate_database` runs) or the database manager. @@ -3208,6 +3222,7 @@ impl, Cold: ItemStore> HotColdDB } let blob_info = self.get_blob_info(); + let data_column_info = self.get_data_column_info(); let Some(oldest_blob_slot) = blob_info.oldest_blob_slot else { error!("Slot of oldest blob is not known"); return Err(HotColdDBError::BlobPruneLogicError.into()); @@ -3306,13 +3321,7 @@ impl, Cold: ItemStore> HotColdDB } } - let new_blob_info = BlobInfo { - oldest_blob_slot: Some(end_slot + 1), - blobs_db: blob_info.blobs_db, - }; - - let op = self.compare_and_set_blob_info(blob_info, new_blob_info)?; - self.do_atomically_with_block_and_blobs_cache(vec![StoreOp::KeyValueOp(op)])?; + self.update_blob_or_data_column_info(start_epoch, end_slot, blob_info, data_column_info)?; debug!("Blob pruning complete"); @@ -3379,6 +3388,31 @@ impl, Cold: ItemStore> HotColdDB Ok(()) } + + fn update_blob_or_data_column_info( + &self, + start_epoch: Epoch, + end_slot: Slot, + blob_info: BlobInfo, + data_column_info: DataColumnInfo, + ) -> Result<(), Error> { + let op = if self.spec.is_peer_das_enabled_for_epoch(start_epoch) { + let new_data_column_info = DataColumnInfo { + oldest_data_column_slot: Some(end_slot + 1), + }; + self.compare_and_set_data_column_info(data_column_info, new_data_column_info)? + } else { + let new_blob_info = BlobInfo { + oldest_blob_slot: Some(end_slot + 1), + blobs_db: blob_info.blobs_db, + }; + self.compare_and_set_blob_info(blob_info, new_blob_info)? 
+ }; + + self.do_atomically_with_block_and_blobs_cache(vec![StoreOp::KeyValueOp(op)])?; + + Ok(()) + } } /// Advance the split point of the store, copying new finalized states to the freezer. diff --git a/book/src/help_bn.md b/book/src/help_bn.md index 6680202a27..5f3c43a7e4 100644 --- a/book/src/help_bn.md +++ b/book/src/help_bn.md @@ -552,6 +552,12 @@ Flags: When present, Lighthouse will forget the payload statuses of any already-imported blocks. This can assist in the recovery from a consensus failure caused by the execution layer. + --semi-supernode + Run in minimal reconstruction mode. This node will subscribe to and + custody half of the data columns (enough for reconstruction), enabling + efficient data availability with lower bandwidth and storage + requirements compared to a supernode, while still supporting full blob + reconstruction. --shutdown-after-sync Shutdown beacon node as soon as sync is completed. Backfill sync will not be performed before shutdown. diff --git a/common/eip_3076/Cargo.toml b/common/eip_3076/Cargo.toml new file mode 100644 index 0000000000..851ef26238 --- /dev/null +++ b/common/eip_3076/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "eip_3076" +version = "0.1.0" +authors = ["Sigma Prime "] +edition = { workspace = true } + +[features] +default = [] +arbitrary-fuzz = ["dep:arbitrary", "types/arbitrary"] +json = ["dep:serde_json"] + +[dependencies] +arbitrary = { workspace = true, features = ["derive"], optional = true } +ethereum_serde_utils = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true, optional = true } +types = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/validator_client/slashing_protection/src/interchange.rs b/common/eip_3076/src/lib.rs similarity index 64% rename from validator_client/slashing_protection/src/interchange.rs rename to common/eip_3076/src/lib.rs index 95a39c50e4..2d47a77de4 100644 --- 
a/validator_client/slashing_protection/src/interchange.rs +++ b/common/eip_3076/src/lib.rs @@ -1,10 +1,15 @@ -use crate::InterchangeError; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::collections::{HashMap, HashSet}; +#[cfg(feature = "json")] use std::io; use types::{Epoch, Hash256, PublicKeyBytes, Slot}; +#[derive(Debug)] +pub enum Error { + MaxInconsistent, +} + #[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] #[serde(deny_unknown_fields)] #[cfg_attr(feature = "arbitrary-fuzz", derive(arbitrary::Arbitrary))] @@ -53,10 +58,12 @@ pub struct Interchange { } impl Interchange { + #[cfg(feature = "json")] pub fn from_json_str(json: &str) -> Result { serde_json::from_str(json) } + #[cfg(feature = "json")] pub fn from_json_reader(mut reader: impl std::io::Read) -> Result { // We read the entire file into memory first, as this is *a lot* faster than using // `serde_json::from_reader`. See https://github.com/serde-rs/json/issues/160 @@ -65,6 +72,7 @@ impl Interchange { Ok(Interchange::from_json_str(&json_str)?) } + #[cfg(feature = "json")] pub fn write_to(&self, writer: impl std::io::Write) -> Result<(), serde_json::Error> { serde_json::to_writer(writer, self) } @@ -87,7 +95,7 @@ impl Interchange { } /// Minify an interchange by constructing a synthetic block & attestation for each validator. - pub fn minify(&self) -> Result { + pub fn minify(&self) -> Result { // Map from pubkey to optional max block and max attestation. let mut validator_data = HashMap::, Option)>::new(); @@ -124,7 +132,7 @@ impl Interchange { } } (None, None) => {} - _ => return Err(InterchangeError::MaxInconsistent), + _ => return Err(Error::MaxInconsistent), }; // Find maximum block slot. 
@@ -157,3 +165,96 @@ impl Interchange { }) } } + +#[cfg(feature = "json")] +#[cfg(test)] +mod tests { + use super::*; + use std::fs::File; + use tempfile::tempdir; + use types::FixedBytesExtended; + + fn get_interchange() -> Interchange { + Interchange { + metadata: InterchangeMetadata { + interchange_format_version: 5, + genesis_validators_root: Hash256::from_low_u64_be(555), + }, + data: vec![ + InterchangeData { + pubkey: PublicKeyBytes::deserialize(&[1u8; 48]).unwrap(), + signed_blocks: vec![SignedBlock { + slot: Slot::new(100), + signing_root: Some(Hash256::from_low_u64_be(1)), + }], + signed_attestations: vec![SignedAttestation { + source_epoch: Epoch::new(0), + target_epoch: Epoch::new(5), + signing_root: Some(Hash256::from_low_u64_be(2)), + }], + }, + InterchangeData { + pubkey: PublicKeyBytes::deserialize(&[2u8; 48]).unwrap(), + signed_blocks: vec![], + signed_attestations: vec![], + }, + ], + } + } + + #[test] + fn test_roundtrip() { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("interchange.json"); + + let interchange = get_interchange(); + + let mut file = File::create(&file_path).unwrap(); + interchange.write_to(&mut file).unwrap(); + + let file = File::open(&file_path).unwrap(); + let from_file = Interchange::from_json_reader(file).unwrap(); + + assert_eq!(interchange, from_file); + } + + #[test] + fn test_empty_roundtrip() { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("empty.json"); + + let empty = Interchange { + metadata: InterchangeMetadata { + interchange_format_version: 5, + genesis_validators_root: Hash256::zero(), + }, + data: vec![], + }; + + let mut file = File::create(&file_path).unwrap(); + empty.write_to(&mut file).unwrap(); + + let file = File::open(&file_path).unwrap(); + let from_file = Interchange::from_json_reader(file).unwrap(); + + assert_eq!(empty, from_file); + } + + #[test] + fn test_minify_roundtrip() { + let interchange = get_interchange(); + + let minified = 
interchange.minify().unwrap(); + + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("minified.json"); + + let mut file = File::create(&file_path).unwrap(); + minified.write_to(&mut file).unwrap(); + + let file = File::open(&file_path).unwrap(); + let from_file = Interchange::from_json_reader(file).unwrap(); + + assert_eq!(minified, from_file); + } +} diff --git a/common/eth2/Cargo.toml b/common/eth2/Cargo.toml index 81666a6421..46066a559f 100644 --- a/common/eth2/Cargo.toml +++ b/common/eth2/Cargo.toml @@ -10,6 +10,7 @@ lighthouse = [] [dependencies] derivative = { workspace = true } +eip_3076 = { workspace = true } either = { workspace = true } enr = { version = "0.13.0", features = ["ed25519"] } eth2_keystore = { workspace = true } @@ -29,7 +30,6 @@ reqwest-eventsource = "0.5.0" sensitive_url = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } -slashing_protection = { workspace = true } ssz_types = { workspace = true } test_random_derive = { path = "../../common/test_random_derive" } types = { workspace = true } diff --git a/common/eth2/src/lib.rs b/common/eth2/src/lib.rs index 0423794d0d..995e6966ea 100644 --- a/common/eth2/src/lib.rs +++ b/common/eth2/src/lib.rs @@ -943,7 +943,7 @@ impl BeaconNodeHttpClient { pub async fn get_beacon_states_pending_consolidations( &self, state_id: StateId, - ) -> Result>>, Error> + ) -> Result>>, Error> { let mut path = self.eth_path(V1)?; @@ -954,7 +954,9 @@ impl BeaconNodeHttpClient { .push(&state_id.to_string()) .push("pending_consolidations"); - self.get_opt(path).await + self.get_fork_contextual(path, |fork| fork) + .await + .map(|opt| opt.map(BeaconResponse::ForkVersioned)) } /// `GET beacon/light_client/updates` diff --git a/common/eth2/src/lighthouse/sync_state.rs b/common/eth2/src/lighthouse/sync_state.rs index 0327f7073f..9f6f3b52e0 100644 --- a/common/eth2/src/lighthouse/sync_state.rs +++ b/common/eth2/src/lighthouse/sync_state.rs @@ -15,6 +15,10 @@ pub enum 
SyncState { /// specified by its peers. Once completed, the node enters this sync state and attempts to /// download all required historical blocks. BackFillSyncing { completed: usize, remaining: usize }, + /// The node is undertaking a custody backfill sync. This occurs for a node that has completed forward and + /// backfill sync and has undergone a custody count change. During custody backfill sync the node attempts + /// to backfill its new column custody requirements up to the data availability window. + CustodyBackFillSyncing { completed: usize, remaining: usize }, /// The node has completed syncing a finalized chain and is in the process of re-evaluating /// which sync state to progress to. SyncTransition, @@ -39,6 +43,17 @@ pub enum BackFillState { Failed, } +#[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] +/// The state of the custody backfill sync. +pub enum CustodyBackFillState { + /// We are currently backfilling custody columns. + Syncing, + /// A custody backfill sync has completed. + Completed, + /// A custody sync should is set to Pending for various reasons. + Pending(String), +} + impl PartialEq for SyncState { fn eq(&self, other: &Self) -> bool { matches!( @@ -54,6 +69,10 @@ impl PartialEq for SyncState { SyncState::BackFillSyncing { .. }, SyncState::BackFillSyncing { .. } ) + | ( + SyncState::CustodyBackFillSyncing { .. }, + SyncState::CustodyBackFillSyncing { .. } + ) ) } } @@ -65,8 +84,8 @@ impl SyncState { SyncState::SyncingFinalized { .. } => true, SyncState::SyncingHead { .. } => true, SyncState::SyncTransition => true, - // Backfill doesn't effect any logic, we consider this state, not syncing. - SyncState::BackFillSyncing { .. } => false, + // Both backfill and custody backfill don't effect any logic, we consider this state, not syncing. + SyncState::BackFillSyncing { .. } | SyncState::CustodyBackFillSyncing { .. 
} => false, SyncState::Synced => false, SyncState::Stalled => false, } @@ -77,7 +96,7 @@ impl SyncState { SyncState::SyncingFinalized { .. } => true, SyncState::SyncingHead { .. } => false, SyncState::SyncTransition => false, - SyncState::BackFillSyncing { .. } => false, + SyncState::BackFillSyncing { .. } | SyncState::CustodyBackFillSyncing { .. } => false, SyncState::Synced => false, SyncState::Stalled => false, } @@ -87,7 +106,12 @@ impl SyncState { /// /// NOTE: We consider the node synced if it is fetching old historical blocks. pub fn is_synced(&self) -> bool { - matches!(self, SyncState::Synced | SyncState::BackFillSyncing { .. }) + matches!( + self, + SyncState::Synced + | SyncState::BackFillSyncing { .. } + | SyncState::CustodyBackFillSyncing { .. } + ) } /// Returns true if the node is *stalled*, i.e. has no synced peers. @@ -108,6 +132,9 @@ impl std::fmt::Display for SyncState { SyncState::Stalled => write!(f, "Stalled"), SyncState::SyncTransition => write!(f, "Evaluating known peers"), SyncState::BackFillSyncing { .. } => write!(f, "Syncing Historical Blocks"), + SyncState::CustodyBackFillSyncing { .. 
} => { + write!(f, "Syncing Historical Data Columns") + } } } } diff --git a/common/eth2/src/lighthouse_vc/std_types.rs b/common/eth2/src/lighthouse_vc/std_types.rs index ae192312bd..0290bdd0b7 100644 --- a/common/eth2/src/lighthouse_vc/std_types.rs +++ b/common/eth2/src/lighthouse_vc/std_types.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use types::{Address, Graffiti, PublicKeyBytes}; use zeroize::Zeroizing; -pub use slashing_protection::interchange::Interchange; +pub use eip_3076::Interchange; #[derive(Debug, Deserialize, Serialize, PartialEq)] pub struct GetFeeRecipientResponse { diff --git a/consensus/types/src/chain_spec.rs b/consensus/types/src/chain_spec.rs index 1e4c477386..069a63f511 100644 --- a/consensus/types/src/chain_spec.rs +++ b/consensus/types/src/chain_spec.rs @@ -483,15 +483,23 @@ impl ChainSpec { /// Returns a full `Fork` struct for a given epoch. pub fn fork_at_epoch(&self, epoch: Epoch) -> Fork { let current_fork_name = self.fork_name_at_epoch(epoch); - let previous_fork_name = current_fork_name.previous_fork().unwrap_or(ForkName::Base); - let epoch = self + + let fork_epoch = self .fork_epoch(current_fork_name) .unwrap_or_else(|| Epoch::new(0)); + // At genesis the Fork is initialised with two copies of the same value for both + // `previous_version` and `current_version` (see `initialize_beacon_state_from_eth1`). 
+ let previous_fork_name = if fork_epoch == 0 { + current_fork_name + } else { + current_fork_name.previous_fork().unwrap_or(ForkName::Base) + }; + Fork { previous_version: self.fork_version_for_name(previous_fork_name), current_version: self.fork_version_for_name(current_fork_name), - epoch, + epoch: fork_epoch, } } @@ -3031,9 +3039,11 @@ mod yaml_tests { fn proposer_shuffling_decision_root_around_epoch_boundary() { type E = MainnetEthSpec; let fulu_fork_epoch = 5; + let gloas_fork_epoch = 10; let spec = { let mut spec = ForkName::Electra.make_genesis_spec(E::default_spec()); spec.fulu_fork_epoch = Some(Epoch::new(fulu_fork_epoch)); + spec.gloas_fork_epoch = Some(Epoch::new(gloas_fork_epoch)); Arc::new(spec) }; @@ -3047,7 +3057,7 @@ mod yaml_tests { } // For epochs after Fulu, the decision slot is the end of the epoch two epochs prior. - for epoch in ((fulu_fork_epoch + 1)..(fulu_fork_epoch + 10)).map(Epoch::new) { + for epoch in ((fulu_fork_epoch + 1)..=(gloas_fork_epoch + 1)).map(Epoch::new) { assert_eq!( spec.proposer_shuffling_decision_slot::(epoch), (epoch - 1).start_slot(E::slots_per_epoch()) - 1 diff --git a/consensus/types/src/contribution_and_proof.rs b/consensus/types/src/contribution_and_proof.rs index 85c9ac15fb..4d70cd1f8a 100644 --- a/consensus/types/src/contribution_and_proof.rs +++ b/consensus/types/src/contribution_and_proof.rs @@ -10,7 +10,6 @@ use test_random_derive::TestRandom; use tree_hash_derive::TreeHash; /// A Validators aggregate sync committee contribution and selection proof. 
- #[cfg_attr( feature = "arbitrary", derive(arbitrary::Arbitrary), diff --git a/consensus/types/src/slot_epoch.rs b/consensus/types/src/slot_epoch.rs index 857044f981..05af9c5232 100644 --- a/consensus/types/src/slot_epoch.rs +++ b/consensus/types/src/slot_epoch.rs @@ -33,6 +33,13 @@ pub struct Slot(#[serde(with = "serde_utils::quoted_u64")] u64); #[serde(transparent)] pub struct Epoch(#[serde(with = "serde_utils::quoted_u64")] u64); +impl Epoch { + /// Returns an iterator `(end..=start)` + pub fn range_inclusive_rev(start: Self, end: Self) -> impl Iterator { + (end.0..=start.0).rev().map(Epoch) + } +} + impl_common!(Slot); impl_common!(Epoch); diff --git a/lighthouse/Cargo.toml b/lighthouse/Cargo.toml index ef680c9b96..82bfc5056e 100644 --- a/lighthouse/Cargo.toml +++ b/lighthouse/Cargo.toml @@ -81,6 +81,7 @@ malloc_utils = { workspace = true, features = ["jemalloc"] } malloc_utils = { workspace = true, features = [] } [dev-dependencies] +beacon_node = { workspace = true, features = ["testing"] } beacon_node_fallback = { workspace = true } beacon_processor = { workspace = true } eth2 = { workspace = true } diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index 5a057d7d7f..8342b02173 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -4,6 +4,7 @@ use beacon_node::beacon_chain::chain_config::{ DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION, DEFAULT_SYNC_TOLERANCE_EPOCHS, DisallowedReOrgOffsets, }; +use beacon_node::beacon_chain::custody_context::NodeCustodyType; use beacon_node::{ ClientConfig as Config, beacon_chain::graffiti_calculator::GraffitiOrigin, beacon_chain::store::config::DatabaseBackend as BeaconNodeBackend, @@ -782,20 +783,38 @@ fn network_subscribe_all_data_column_subnets_flag() { CommandLineTest::new() .flag("subscribe-all-data-column-subnets", None) .run_with_zero_port() - .with_config(|config| assert!(config.network.subscribe_all_data_column_subnets)); + .with_config(|config| { + 
assert_eq!(config.chain.node_custody_type, NodeCustodyType::Supernode) + }); } #[test] fn network_supernode_flag() { CommandLineTest::new() .flag("supernode", None) .run_with_zero_port() - .with_config(|config| assert!(config.network.subscribe_all_data_column_subnets)); + .with_config(|config| { + assert_eq!(config.chain.node_custody_type, NodeCustodyType::Supernode) + }); } #[test] -fn network_subscribe_all_data_column_subnets_default() { +fn network_semi_supernode_flag() { + CommandLineTest::new() + .flag("semi-supernode", None) + .run_with_zero_port() + .with_config(|config| { + assert_eq!( + config.chain.node_custody_type, + NodeCustodyType::SemiSupernode + ) + }); +} +#[test] +fn network_node_custody_type_default() { CommandLineTest::new() .run_with_zero_port() - .with_config(|config| assert!(!config.network.subscribe_all_data_column_subnets)); + .with_config(|config| { + assert_eq!(config.chain.node_custody_type, NodeCustodyType::Fullnode) + }); } #[test] fn blob_publication_batches() { diff --git a/scripts/tests/checkpoint-sync.sh b/scripts/tests/checkpoint-sync.sh index df03da042e..605dc504f5 100755 --- a/scripts/tests/checkpoint-sync.sh +++ b/scripts/tests/checkpoint-sync.sh @@ -102,7 +102,8 @@ node_completed["fullnode"]=false echo "Polling sync status until backfill reaches ${TARGET_BACKFILL_SLOTS} slots or timeout of ${TIMEOUT_MINS} mins" -while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode]}" = false ]; do +# while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode]}" = false ]; do +while [ "${node_completed[fullnode]}" = false ]; do current_time=$(date +%s) elapsed=$((current_time - start_time)) @@ -112,7 +113,8 @@ while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode fi # Poll each node that hasn't completed yet - for node in "supernode" "fullnode"; do + # for node in "supernode" "fullnode"; do + for node in "fullnode"; do if [ "${node_completed[$node]}" = false ]; then 
poll_node "$node" fi @@ -121,7 +123,7 @@ while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode sleep $POLL_INTERVAL_SECS done -echo "Sync test complete! Both supernode and fullnode have synced to HEAD and backfilled ${TARGET_BACKFILL_SLOTS} slots." -echo "Supernode time: $((node_complete_time[supernode] - start_time)) seconds" +echo "Sync test complete! Fullnode has synced to HEAD and backfilled ${TARGET_BACKFILL_SLOTS} slots." +# echo "Supernode time: $((node_complete_time[supernode] - start_time)) seconds" echo "Fullnode time: $((node_complete_time[fullnode] - start_time)) seconds" exit_and_dump_logs 0 \ No newline at end of file diff --git a/testing/ef_tests/src/cases/fork_choice.rs b/testing/ef_tests/src/cases/fork_choice.rs index 1380e44acd..47b9902345 100644 --- a/testing/ef_tests/src/cases/fork_choice.rs +++ b/testing/ef_tests/src/cases/fork_choice.rs @@ -16,6 +16,7 @@ use beacon_chain::{ VerifiedAttestation, obtain_indexed_attestation_and_committees_per_slot, }, blob_verification::GossipVerifiedBlob, + custody_context::NodeCustodyType, test_utils::{BeaconChainHarness, EphemeralHarnessType}, }; use execution_layer::{PayloadStatusV1, json_structures::JsonPayloadStatusV1Status}; @@ -436,7 +437,7 @@ impl Tester { .genesis_state_ephemeral_store(case.anchor_state.clone()) .mock_execution_layer() .recalculate_fork_times_with_genesis(0) - .import_all_data_columns(true) + .node_custody_type(NodeCustodyType::Supernode) .mock_execution_layer_all_payloads_valid() .build(); diff --git a/validator_client/slashing_protection/Cargo.toml b/validator_client/slashing_protection/Cargo.toml index 3860af514d..6a778c5de3 100644 --- a/validator_client/slashing_protection/Cargo.toml +++ b/validator_client/slashing_protection/Cargo.toml @@ -6,11 +6,12 @@ edition = { workspace = true } autotests = false [features] -arbitrary-fuzz = ["types/arbitrary-fuzz"] +arbitrary-fuzz = ["types/arbitrary-fuzz", "eip_3076/arbitrary-fuzz"] portable = ["types/portable"] 
[dependencies] arbitrary = { workspace = true, features = ["derive"] } +eip_3076 = { workspace = true, features = ["json"] } ethereum_serde_utils = { workspace = true } filesystem = { workspace = true } r2d2 = { workspace = true } diff --git a/validator_client/slashing_protection/src/bin/test_generator.rs b/validator_client/slashing_protection/src/bin/test_generator.rs index 4576231b7b..dfda7983f7 100644 --- a/validator_client/slashing_protection/src/bin/test_generator.rs +++ b/validator_client/slashing_protection/src/bin/test_generator.rs @@ -1,7 +1,5 @@ +use eip_3076::{Interchange, InterchangeData, InterchangeMetadata, SignedAttestation, SignedBlock}; use slashing_protection::SUPPORTED_INTERCHANGE_FORMAT_VERSION; -use slashing_protection::interchange::{ - Interchange, InterchangeData, InterchangeMetadata, SignedAttestation, SignedBlock, -}; use slashing_protection::interchange_test::{MultiTestCase, TestCase}; use slashing_protection::test_utils::{DEFAULT_GENESIS_VALIDATORS_ROOT, pubkey}; use std::fs::{self, File}; diff --git a/validator_client/slashing_protection/src/interchange_test.rs b/validator_client/slashing_protection/src/interchange_test.rs index 1bc4326b4f..ebe0105f24 100644 --- a/validator_client/slashing_protection/src/interchange_test.rs +++ b/validator_client/slashing_protection/src/interchange_test.rs @@ -1,8 +1,8 @@ use crate::{ SigningRoot, SlashingDatabase, - interchange::{Interchange, SignedAttestation, SignedBlock}, test_utils::{DEFAULT_GENESIS_VALIDATORS_ROOT, pubkey}, }; +use eip_3076::{Interchange, SignedAttestation, SignedBlock}; use serde::{Deserialize, Serialize}; use std::collections::HashSet; use tempfile::tempdir; diff --git a/validator_client/slashing_protection/src/lib.rs b/validator_client/slashing_protection/src/lib.rs index ded64adb49..917d51d38b 100644 --- a/validator_client/slashing_protection/src/lib.rs +++ b/validator_client/slashing_protection/src/lib.rs @@ -1,7 +1,6 @@ mod attestation_tests; mod block_tests; mod 
extra_interchange_tests; -pub mod interchange; pub mod interchange_test; mod parallel_tests; mod registration_tests; @@ -10,6 +9,10 @@ mod signed_block; mod slashing_database; pub mod test_utils; +pub mod interchange { + pub use eip_3076::{Interchange, InterchangeMetadata}; +} + pub use crate::signed_attestation::{InvalidAttestation, SignedAttestation}; pub use crate::signed_block::{InvalidBlock, SignedBlock}; pub use crate::slashing_database::{ diff --git a/validator_client/slashing_protection/src/slashing_database.rs b/validator_client/slashing_protection/src/slashing_database.rs index 7d8947a584..ce32299a51 100644 --- a/validator_client/slashing_protection/src/slashing_database.rs +++ b/validator_client/slashing_protection/src/slashing_database.rs @@ -1,10 +1,10 @@ -use crate::interchange::{ - Interchange, InterchangeData, InterchangeMetadata, SignedAttestation as InterchangeAttestation, - SignedBlock as InterchangeBlock, -}; use crate::signed_attestation::InvalidAttestation; use crate::signed_block::InvalidBlock; use crate::{NotSafe, Safe, SignedAttestation, SignedBlock, SigningRoot, signing_root_from_row}; +use eip_3076::{ + Interchange, InterchangeData, InterchangeMetadata, SignedAttestation as InterchangeAttestation, + SignedBlock as InterchangeBlock, +}; use filesystem::restrict_file_permissions; use r2d2_sqlite::SqliteConnectionManager; use rusqlite::{OptionalExtension, Transaction, TransactionBehavior, params}; @@ -1219,7 +1219,7 @@ pub enum InterchangeError { interchange_file: Hash256, client: Hash256, }, - MaxInconsistent, + Eip3076(eip_3076::Error), SummaryInconsistent, SQLError(String), SQLPoolError(r2d2::Error),