mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-20 21:34:46 +00:00
Refactor data column reconstruction and avoid blocking processing (#6403)
* Move reconstruction logic out of `overflow_lru_cache` to simplify the code and avoids having to pass `DataColumnsToPublish` around and blocking other processing. * Publish reconstructed cells before recomputing head. Remove duplicate functions. * Merge branch 'unstable' into non-blocking-reconstruction * Merge branch 'unstable' into non-blocking-reconstruction # Conflicts: # beacon_node/beacon_chain/src/beacon_chain.rs # beacon_node/beacon_chain/src/data_availability_checker.rs # beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs # beacon_node/network/src/network_beacon_processor/sync_methods.rs * Spawn a blocking task for reconstruction. * Merge branch 'unstable' into non-blocking-reconstruction # Conflicts: # beacon_node/network/src/network_beacon_processor/mod.rs * Fix fmt * Merge branch 'unstable' into non-blocking-reconstruction # Conflicts: # beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs * Fix race condition by making check and mutation atomic as suggested by Lion. Also added error handling to reconstruction failure. * Add reconstruction reason metric and more debug logging to da checker. * Add comment and logging. * Rename `NotRequired` to `NotStarted`. * Remove extra character added.
This commit is contained in:
@@ -2,10 +2,12 @@ use crate::blob_verification::{verify_kzg_for_blob_list, GossipVerifiedBlob, Kzg
|
||||
use crate::block_verification_types::{
|
||||
AvailabilityPendingExecutedBlock, AvailableExecutedBlock, RpcBlock,
|
||||
};
|
||||
use crate::data_availability_checker::overflow_lru_cache::DataAvailabilityCheckerInner;
|
||||
use crate::{BeaconChain, BeaconChainTypes, BeaconStore};
|
||||
use crate::data_availability_checker::overflow_lru_cache::{
|
||||
DataAvailabilityCheckerInner, ReconstructColumnsDecision,
|
||||
};
|
||||
use crate::{metrics, BeaconChain, BeaconChainTypes, BeaconStore};
|
||||
use kzg::Kzg;
|
||||
use slog::{debug, error};
|
||||
use slog::{debug, error, Logger};
|
||||
use slot_clock::SlotClock;
|
||||
use std::fmt;
|
||||
use std::fmt::Debug;
|
||||
@@ -27,11 +29,12 @@ use crate::data_column_verification::{
|
||||
verify_kzg_for_data_column, verify_kzg_for_data_column_list, CustodyDataColumn,
|
||||
GossipVerifiedDataColumn, KzgVerifiedCustodyDataColumn, KzgVerifiedDataColumn,
|
||||
};
|
||||
use crate::metrics::{
|
||||
KZG_DATA_COLUMN_RECONSTRUCTION_ATTEMPTS, KZG_DATA_COLUMN_RECONSTRUCTION_FAILURES,
|
||||
};
|
||||
pub use error::{Error as AvailabilityCheckError, ErrorCategory as AvailabilityCheckErrorCategory};
|
||||
use types::non_zero_usize::new_non_zero_usize;
|
||||
|
||||
pub use self::overflow_lru_cache::DataColumnsToPublish;
|
||||
|
||||
/// The LRU Cache stores `PendingComponents` which can store up to
|
||||
/// `MAX_BLOBS_PER_BLOCK = 6` blobs each. A `BlobSidecar` is 0.131256 MB. So
|
||||
/// the maximum size of a `PendingComponents` is ~ 0.787536 MB. Setting this
|
||||
@@ -71,6 +74,16 @@ pub struct DataAvailabilityChecker<T: BeaconChainTypes> {
|
||||
slot_clock: T::SlotClock,
|
||||
kzg: Arc<Kzg>,
|
||||
spec: Arc<ChainSpec>,
|
||||
log: Logger,
|
||||
}
|
||||
|
||||
pub type AvailabilityAndReconstructedColumns<E> = (Availability<E>, DataColumnSidecarList<E>);
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum DataColumnReconstructionResult<E: EthSpec> {
|
||||
Success(AvailabilityAndReconstructedColumns<E>),
|
||||
NotStarted(&'static str),
|
||||
RecoveredColumnsNotImported(&'static str),
|
||||
}
|
||||
|
||||
/// This type is returned after adding a block / blob to the `DataAvailabilityChecker`.
|
||||
@@ -101,6 +114,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
|
||||
store: BeaconStore<T>,
|
||||
import_all_data_columns: bool,
|
||||
spec: Arc<ChainSpec>,
|
||||
log: Logger,
|
||||
) -> Result<Self, AvailabilityCheckError> {
|
||||
let custody_subnet_count = if import_all_data_columns {
|
||||
spec.data_column_sidecar_subnet_count as usize
|
||||
@@ -124,6 +138,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
|
||||
slot_clock,
|
||||
kzg,
|
||||
spec,
|
||||
log,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -205,7 +220,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
|
||||
.map_err(AvailabilityCheckError::InvalidBlobs)?;
|
||||
|
||||
self.availability_cache
|
||||
.put_kzg_verified_blobs(block_root, epoch, verified_blobs)
|
||||
.put_kzg_verified_blobs(block_root, epoch, verified_blobs, &self.log)
|
||||
}
|
||||
|
||||
/// Put a list of custody columns received via RPC into the availability cache. This performs KZG
|
||||
@@ -216,8 +231,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
|
||||
block_root: Hash256,
|
||||
epoch: Epoch,
|
||||
custody_columns: DataColumnSidecarList<T::EthSpec>,
|
||||
) -> Result<(Availability<T::EthSpec>, DataColumnsToPublish<T::EthSpec>), AvailabilityCheckError>
|
||||
{
|
||||
) -> Result<Availability<T::EthSpec>, AvailabilityCheckError> {
|
||||
// TODO(das): report which column is invalid for proper peer scoring
|
||||
// TODO(das): batch KZG verification here, but fallback into checking each column
|
||||
// individually to report which column(s) are invalid.
|
||||
@@ -233,10 +247,10 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
|
||||
.collect::<Result<Vec<_>, AvailabilityCheckError>>()?;
|
||||
|
||||
self.availability_cache.put_kzg_verified_data_columns(
|
||||
&self.kzg,
|
||||
block_root,
|
||||
epoch,
|
||||
verified_custody_columns,
|
||||
&self.log,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -253,6 +267,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
|
||||
gossip_blob.block_root(),
|
||||
gossip_blob.epoch(),
|
||||
vec![gossip_blob.into_inner()],
|
||||
&self.log,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -267,8 +282,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
|
||||
slot: Slot,
|
||||
block_root: Hash256,
|
||||
gossip_data_columns: Vec<GossipVerifiedDataColumn<T>>,
|
||||
) -> Result<(Availability<T::EthSpec>, DataColumnsToPublish<T::EthSpec>), AvailabilityCheckError>
|
||||
{
|
||||
) -> Result<Availability<T::EthSpec>, AvailabilityCheckError> {
|
||||
let epoch = slot.epoch(T::EthSpec::slots_per_epoch());
|
||||
|
||||
let custody_columns = gossip_data_columns
|
||||
@@ -277,10 +291,10 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
self.availability_cache.put_kzg_verified_data_columns(
|
||||
&self.kzg,
|
||||
block_root,
|
||||
epoch,
|
||||
custody_columns,
|
||||
&self.log,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -291,7 +305,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
|
||||
executed_block: AvailabilityPendingExecutedBlock<T::EthSpec>,
|
||||
) -> Result<Availability<T::EthSpec>, AvailabilityCheckError> {
|
||||
self.availability_cache
|
||||
.put_pending_executed_block(executed_block)
|
||||
.put_pending_executed_block(executed_block, &self.log)
|
||||
}
|
||||
|
||||
pub fn remove_pending_components(&self, block_root: Hash256) {
|
||||
@@ -511,6 +525,92 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
|
||||
block_cache_size: self.availability_cache.block_cache_size(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reconstruct_data_columns(
|
||||
&self,
|
||||
block_root: &Hash256,
|
||||
) -> Result<DataColumnReconstructionResult<T::EthSpec>, AvailabilityCheckError> {
|
||||
let pending_components = match self
|
||||
.availability_cache
|
||||
.check_and_set_reconstruction_started(block_root)
|
||||
{
|
||||
ReconstructColumnsDecision::Yes(pending_components) => pending_components,
|
||||
ReconstructColumnsDecision::No(reason) => {
|
||||
return Ok(DataColumnReconstructionResult::NotStarted(reason));
|
||||
}
|
||||
};
|
||||
|
||||
metrics::inc_counter(&KZG_DATA_COLUMN_RECONSTRUCTION_ATTEMPTS);
|
||||
let timer = metrics::start_timer(&metrics::DATA_AVAILABILITY_RECONSTRUCTION_TIME);
|
||||
|
||||
let all_data_columns = KzgVerifiedCustodyDataColumn::reconstruct_columns(
|
||||
&self.kzg,
|
||||
&pending_components.verified_data_columns,
|
||||
&self.spec,
|
||||
)
|
||||
.map_err(|e| {
|
||||
error!(
|
||||
self.log,
|
||||
"Error reconstructing data columns";
|
||||
"block_root" => ?block_root,
|
||||
"error" => ?e
|
||||
);
|
||||
self.availability_cache
|
||||
.handle_reconstruction_failure(block_root);
|
||||
metrics::inc_counter(&KZG_DATA_COLUMN_RECONSTRUCTION_FAILURES);
|
||||
AvailabilityCheckError::ReconstructColumnsError(e)
|
||||
})?;
|
||||
|
||||
// Check indices from cache again to make sure we don't publish components we've already received.
|
||||
let Some(existing_column_indices) = self.cached_data_column_indexes(block_root) else {
|
||||
return Ok(DataColumnReconstructionResult::RecoveredColumnsNotImported(
|
||||
"block already imported",
|
||||
));
|
||||
};
|
||||
|
||||
let data_columns_to_publish = all_data_columns
|
||||
.into_iter()
|
||||
.filter(|d| !existing_column_indices.contains(&d.index()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let Some(slot) = data_columns_to_publish
|
||||
.first()
|
||||
.map(|d| d.as_data_column().slot())
|
||||
else {
|
||||
return Ok(DataColumnReconstructionResult::RecoveredColumnsNotImported(
|
||||
"No new columns to import and publish",
|
||||
));
|
||||
};
|
||||
|
||||
metrics::stop_timer(timer);
|
||||
metrics::inc_counter_by(
|
||||
&metrics::DATA_AVAILABILITY_RECONSTRUCTED_COLUMNS,
|
||||
data_columns_to_publish.len() as u64,
|
||||
);
|
||||
|
||||
debug!(self.log, "Reconstructed columns";
|
||||
"count" => data_columns_to_publish.len(),
|
||||
"block_root" => ?block_root,
|
||||
"slot" => slot,
|
||||
);
|
||||
|
||||
self.availability_cache
|
||||
.put_kzg_verified_data_columns(
|
||||
*block_root,
|
||||
slot.epoch(T::EthSpec::slots_per_epoch()),
|
||||
data_columns_to_publish.clone(),
|
||||
&self.log,
|
||||
)
|
||||
.map(|availability| {
|
||||
DataColumnReconstructionResult::Success((
|
||||
availability,
|
||||
data_columns_to_publish
|
||||
.into_iter()
|
||||
.map(|d| d.clone_arc())
|
||||
.collect::<Vec<_>>(),
|
||||
))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper struct to group data availability checker metrics.
|
||||
|
||||
Reference in New Issue
Block a user