mirror of https://github.com/sigp/lighthouse.git
Enable reconstruction for nodes custodying more than 50% of columns and instrument tracing (#8052)
Co-Authored-By: Jimmy Chen <jchen.tc@gmail.com>
Co-Authored-By: Jimmy Chen <jimmy@sigmaprime.io>
@@ -3299,10 +3299,14 @@ impl<T: BeaconChainTypes> BeaconChain<T> {

         let data_availability_checker = self.data_availability_checker.clone();
+        let current_span = Span::current();
         let result = self
             .task_executor
             .spawn_blocking_handle(
-                move || data_availability_checker.reconstruct_data_columns(&block_root),
+                move || {
+                    let _guard = current_span.enter();
+                    data_availability_checker.reconstruct_data_columns(&block_root)
+                },
                 "reconstruct_data_columns",
             )
             .ok_or(BeaconChainError::RuntimeShutdown)?
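
Note on the hunk above: tracing spans do not automatically follow work onto a blocking thread pool, so the commit captures the caller's span and re-enters it inside the spawned closure. A minimal standalone sketch of the same pattern, assuming plain tokio's spawn_blocking rather than Lighthouse's spawn_blocking_handle wrapper:

use tracing::Span;

async fn reconstruct_on_blocking_pool() {
    // Capture the caller's span before handing work to the blocking pool...
    let current_span = Span::current();
    tokio::task::spawn_blocking(move || {
        // ...and enter it inside the closure so events emitted by the blocking work
        // are recorded under the caller's span instead of appearing detached.
        let _guard = current_span.enter();
        tracing::debug!("running data column reconstruction");
    })
    .await
    .expect("blocking task panicked");
}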
@@ -547,6 +547,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
         }
     }

+    #[instrument(skip_all, level = "debug")]
     pub fn reconstruct_data_columns(
         &self,
         block_root: &Hash256,
@@ -592,9 +592,9 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {

     /// Check whether data column reconstruction should be attempted.
     ///
-    /// Potentially trigger reconstruction if:
-    /// - Our custody requirement is all columns (supernode), and we haven't got all columns
-    /// - We have >= 50% of columns, but not all columns
+    /// Potentially trigger reconstruction if all the following satisfy:
+    /// - Our custody requirement is more than 50% of total columns,
+    /// - We haven't received all required columns
+    /// - Reconstruction hasn't been started for the block
     ///
     /// If reconstruction is required, returns `PendingComponents` which contains the
@@ -609,15 +609,25 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
             return ReconstructColumnsDecision::No("block already imported");
         };

-        // If we're sampling all columns, it means we must be custodying all columns.
+        let Some(epoch) = pending_components
+            .verified_data_columns
+            .first()
+            .map(|c| c.as_data_column().epoch())
+        else {
+            return ReconstructColumnsDecision::No("not enough columns");
+        };
+
         let total_column_count = T::EthSpec::number_of_columns();
+        let sampling_column_count = self
+            .custody_context
+            .num_of_data_columns_to_sample(epoch, &self.spec);
         let received_column_count = pending_components.verified_data_columns.len();

         if pending_components.reconstruction_started {
             return ReconstructColumnsDecision::No("already started");
         }
-        if received_column_count >= total_column_count {
-            return ReconstructColumnsDecision::No("all columns received");
+        if received_column_count >= sampling_column_count {
+            return ReconstructColumnsDecision::No("all sampling columns received");
         }
         if received_column_count < total_column_count / 2 {
             return ReconstructColumnsDecision::No("not enough columns");
@@ -130,7 +130,7 @@ pub struct CustodyContext<E: EthSpec> {
     /// and enr values.
     validator_custody_count: AtomicU64,
     /// Is the node run as a supernode based on current cli parameters.
-    pub current_is_supernode: bool,
+    current_is_supernode: bool,
     /// The persisted value for `is_supernode` based on the previous run of this node.
     ///
     /// Note: We require this value because if a user restarts the node with a higher cli custody
@@ -307,6 +307,14 @@ impl<E: EthSpec> CustodyContext<E> {
             .expect("should compute node sampling size from valid chain spec")
     }

+    /// Returns whether the node should attempt reconstruction at a given epoch.
+    pub fn should_attempt_reconstruction(&self, epoch: Epoch, spec: &ChainSpec) -> bool {
+        let min_columns_for_reconstruction = E::number_of_columns() / 2;
+        // performing reconstruction is not necessary if sampling column count is exactly 50%,
+        // because the node doesn't need the remaining columns.
+        self.num_of_data_columns_to_sample(epoch, spec) > min_columns_for_reconstruction
+    }
+
     /// Returns the ordered list of column indices that should be sampled for data availability checking at the given epoch.
     ///
     /// # Parameters
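
The new `should_attempt_reconstruction` helper above only returns true when the node samples strictly more than half of the columns: reconstruction needs at least 50% of the columns as input, and a node custodying exactly 50% never needs the columns it does not already sample. A hedged, standalone illustration of the threshold, assuming a 128-column configuration (the real code takes the count from the chain spec via `E::number_of_columns()`):

const NUMBER_OF_COLUMNS: usize = 128; // assumed configuration value

fn should_attempt_reconstruction(sampling_column_count: usize) -> bool {
    let min_columns_for_reconstruction = NUMBER_OF_COLUMNS / 2; // 64
    // Exactly 50% means the node already has everything it is required to custody,
    // so only nodes sampling strictly more than half benefit from reconstructing.
    sampling_column_count > min_columns_for_reconstruction
}

fn main() {
    assert!(!should_attempt_reconstruction(64)); // exactly 50%: no reconstruction
    assert!(should_attempt_reconstruction(65)); // just over 50%: reconstruct
    assert!(should_attempt_reconstruction(128)); // supernode: reconstruct
}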
@@ -34,7 +34,6 @@ use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 use store::hot_cold_store::HotColdDBError;
-use tokio::sync::mpsc::error::TrySendError;
 use tracing::{Instrument, Span, debug, error, info, instrument, trace, warn};
 use types::{
     Attestation, AttestationData, AttestationRef, AttesterSlashing, BlobSidecar, DataColumnSidecar,
@@ -1054,36 +1053,43 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
                         "Processed data column, waiting for other components"
                     );

-                    // Instead of triggering reconstruction immediately, schedule it to be run. If
-                    // another column arrives it either completes availability or pushes
-                    // reconstruction back a bit.
-                    let cloned_self = Arc::clone(self);
-                    let block_root = *block_root;
-                    let send_result = self.beacon_processor_send.try_send(WorkEvent {
-                        drop_during_sync: false,
-                        work: Work::Reprocess(ReprocessQueueMessage::DelayColumnReconstruction(
-                            QueuedColumnReconstruction {
-                                block_root,
-                                slot: *slot,
-                                process_fn: Box::pin(async move {
-                                    cloned_self
-                                        .attempt_data_column_reconstruction(block_root)
-                                        .await;
-                                }),
-                            },
-                        )),
-                    });
-                    if let Err(TrySendError::Full(WorkEvent {
-                        work:
-                            Work::Reprocess(ReprocessQueueMessage::DelayColumnReconstruction(
-                                reconstruction,
-                            )),
-                        ..
-                    })) = send_result
+                    if self
+                        .chain
+                        .data_availability_checker
+                        .custody_context()
+                        .should_attempt_reconstruction(
+                            slot.epoch(T::EthSpec::slots_per_epoch()),
+                            &self.chain.spec,
+                        )
                     {
-                        warn!("Unable to send reconstruction to reprocessing");
-                        // Execute it immediately instead.
-                        reconstruction.process_fn.await;
+                        // Instead of triggering reconstruction immediately, schedule it to be run. If
+                        // another column arrives, it either completes availability or pushes
+                        // reconstruction back a bit.
+                        let cloned_self = Arc::clone(self);
+                        let block_root = *block_root;
+
+                        if self
+                            .beacon_processor_send
+                            .try_send(WorkEvent {
+                                drop_during_sync: false,
+                                work: Work::Reprocess(
+                                    ReprocessQueueMessage::DelayColumnReconstruction(
+                                        QueuedColumnReconstruction {
+                                            block_root,
+                                            slot: *slot,
+                                            process_fn: Box::pin(async move {
+                                                cloned_self
+                                                    .attempt_data_column_reconstruction(block_root)
+                                                    .await;
+                                            }),
+                                        },
+                                    ),
+                                ),
+                            })
+                            .is_err()
+                        {
+                            warn!("Unable to send reconstruction to reprocessing");
+                        }
                     }
                 }
             },
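
The scheduling change above keeps the idea described in the comment: rather than reconstructing as soon as the threshold is crossed, the work is pushed onto the reprocess queue so a column that arrives in the meantime can complete availability (or push reconstruction back) without wasting CPU. A rough sketch of that delay-and-supersede idea using plain tokio primitives, with illustrative names rather than Lighthouse's reprocess-queue types:

use std::time::Duration;
use tokio::{task::JoinHandle, time::sleep};

struct DelayedReconstruction {
    handle: Option<JoinHandle<()>>,
}

impl DelayedReconstruction {
    /// (Re)schedule reconstruction to run after a short delay; each newly arrived
    /// column can call this again to push the work back, or cancel it outright once
    /// availability is satisfied without reconstruction.
    fn schedule<F>(&mut self, delay: Duration, reconstruct: F)
    where
        F: FnOnce() + Send + 'static,
    {
        self.cancel(); // drop any previously scheduled attempt
        self.handle = Some(tokio::spawn(async move {
            sleep(delay).await;
            reconstruct();
        }));
    }

    fn cancel(&mut self) {
        if let Some(handle) = self.handle.take() {
            handle.abort();
        }
    }
}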
@@ -28,7 +28,7 @@ use std::sync::Arc;
 use std::time::Duration;
 use task_executor::TaskExecutor;
 use tokio::sync::mpsc::{self, error::TrySendError};
-use tracing::{debug, error, trace, warn};
+use tracing::{debug, error, instrument, trace, warn};
 use types::*;

 pub use sync_methods::ChainSegmentProcessId;
@@ -825,30 +825,12 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
         }
     }

-    /// Attempt to reconstruct all data columns if the following conditions satisfies:
-    /// - Our custody requirement is all columns
-    /// - We have >= 50% of columns, but not all columns
-    ///
-    /// Returns `Some(AvailabilityProcessingStatus)` if reconstruction is successfully performed,
-    /// otherwise returns `None`.
-    ///
-    /// The `publish_columns` parameter controls whether reconstructed columns should be published
-    /// to the gossip network.
-    async fn attempt_data_column_reconstruction(
-        self: &Arc<Self>,
-        block_root: Hash256,
-    ) -> Option<AvailabilityProcessingStatus> {
-        // Only supernodes attempt reconstruction
-        if !self
-            .chain
-            .data_availability_checker
-            .custody_context()
-            .current_is_supernode
-        {
-            return None;
-        }
+    /// Attempts to reconstruct all data columns if the conditions checked in
+    /// [`DataAvailabilityCheckerInner::check_and_set_reconstruction_started`] are satisfied.
+    #[instrument(level = "debug", skip_all, fields(?block_root))]
+    async fn attempt_data_column_reconstruction(self: &Arc<Self>, block_root: Hash256) {
         let result = self.chain.reconstruct_data_columns(block_root).await;

         match result {
             Ok(Some((availability_processing_status, data_columns_to_publish))) => {
                 self.publish_data_columns_gradually(data_columns_to_publish, block_root);
@@ -864,21 +846,18 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
                     AvailabilityProcessingStatus::MissingComponents(_, _) => {
                         debug!(
                             result = "imported all custody columns",
-                            block_hash = %block_root,
+                            %block_root,
                             "Block components still missing block after reconstruction"
                         );
                     }
                 }
-
-                Some(availability_processing_status)
             }
             Ok(None) => {
                 // reason is tracked via the `KZG_DATA_COLUMN_RECONSTRUCTION_INCOMPLETE_TOTAL` metric
                 trace!(
-                    block_hash = %block_root,
+                    %block_root,
                     "Reconstruction not required for block"
                 );
-                None
             }
             Err(e) => {
                 error!(
@@ -886,7 +865,6 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
                     error = ?e,
                     "Error during data column reconstruction"
                 );
-                None
             }
         }
     }
@@ -975,6 +953,7 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
     /// by some nodes on the network as soon as possible. Our hope is that some columns arrive from
     /// other nodes in the meantime, obviating the need for us to publish them. If no other
     /// publisher exists for a column, it will eventually get published here.
+    #[instrument(level="debug", skip_all, fields(?block_root, data_column_count=data_columns_to_publish.len()))]
     fn publish_data_columns_gradually(
         self: &Arc<Self>,
         mut data_columns_to_publish: DataColumnSidecarList<T::EthSpec>,
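
For the `publish_data_columns_gradually` hunk above, the doc comment describes the intent: spread publication of reconstructed columns over time so peers that would have published them anyway get the chance first. A simplified sketch of that idea, with assumed `publish` and `already_seen` callbacks standing in for the real gossip plumbing:

use rand::seq::SliceRandom;
use std::time::Duration;

async fn publish_gradually<C>(
    mut columns: Vec<C>,
    batch_interval: Duration,
    batch_size: usize,
    already_seen: impl Fn(&C) -> bool,
    publish: impl Fn(C),
) {
    // Randomise the order so different nodes start from different columns.
    columns.shuffle(&mut rand::thread_rng());
    while !columns.is_empty() {
        let batch: Vec<C> = columns
            .drain(..batch_size.min(columns.len()))
            // Skip columns that arrived from other peers while we were waiting.
            .filter(|c| !already_seen(c))
            .collect();
        for column in batch {
            publish(column);
        }
        tokio::time::sleep(batch_interval).await;
    }
}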
@@ -1009,10 +1009,6 @@ async fn import_gossip_block_acceptably_early() {
         rig.assert_event_journal_completes(&[WorkType::GossipDataColumnSidecar])
             .await;
     }
-    if num_data_columns > 0 {
-        rig.assert_event_journal_completes(&[WorkType::ColumnReconstruction])
-            .await;
-    }

     // Note: this section of the code is a bit race-y. We're assuming that we can set the slot clock
     // and check the head in the time between the block arrived early and when its due for