Refactor data column reconstruction and avoid blocking processing (#6403)

* Move reconstruction logic out of `overflow_lru_cache`. This simplifies the code, avoids having to pass `DataColumnsToPublish` around, and avoids blocking other processing.

* Publish reconstructed cells before recomputing head. Remove duplicate functions.

* Merge branch 'unstable' into non-blocking-reconstruction

* Merge branch 'unstable' into non-blocking-reconstruction

# Conflicts:
#	beacon_node/beacon_chain/src/beacon_chain.rs
#	beacon_node/beacon_chain/src/data_availability_checker.rs
#	beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs
#	beacon_node/network/src/network_beacon_processor/sync_methods.rs

* Spawn a blocking task for reconstruction.

* Merge branch 'unstable' into non-blocking-reconstruction

# Conflicts:
#	beacon_node/network/src/network_beacon_processor/mod.rs

* Fix fmt

* Merge branch 'unstable' into non-blocking-reconstruction

# Conflicts:
#	beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs

* Fix race condition by making the check and mutation atomic, as suggested by Lion. Also add error handling for reconstruction failure.

* Add reconstruction reason metric and more debug logging to da checker.

* Add comment and logging.

* Rename `NotRequired` to `NotStarted`.

* Remove extra character added.
This commit is contained in:
Jimmy Chen
2024-10-17 15:56:25 +11:00
committed by GitHub
parent 772929fae2
commit ee7fca3ebd
9 changed files with 454 additions and 246 deletions

View File

@@ -4,6 +4,7 @@ use crate::{
service::NetworkMessage,
sync::SyncMessage,
};
use beacon_chain::blob_verification::{GossipBlobError, GossipVerifiedBlob};
use beacon_chain::block_verification_types::AsBlock;
use beacon_chain::data_column_verification::{GossipDataColumnError, GossipVerifiedDataColumn};
use beacon_chain::store::Error;
@@ -18,13 +19,7 @@ use beacon_chain::{
AvailabilityProcessingStatus, BeaconChainError, BeaconChainTypes, BlockError, ForkChoiceError,
GossipVerifiedBlock, NotifyExecutionLayer,
};
use beacon_chain::{
blob_verification::{GossipBlobError, GossipVerifiedBlob},
data_availability_checker::DataColumnsToPublish,
};
use lighthouse_network::{
Client, MessageAcceptance, MessageId, PeerAction, PeerId, PubsubMessage, ReportSource,
};
use lighthouse_network::{Client, MessageAcceptance, MessageId, PeerAction, PeerId, ReportSource};
use operation_pool::ReceivedPreCapella;
use slog::{crit, debug, error, info, trace, warn, Logger};
use slot_clock::SlotClock;
@@ -171,26 +166,6 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
})
}
/// Publishes the given data columns, if any, via a single `NetworkMessage::Publish`.
///
/// Does nothing when `data_columns_to_publish` is `None`. Otherwise each column is wrapped
/// in a `PubsubMessage::DataColumnSidecar` together with the subnet derived from its column
/// index and the chain spec.
pub(crate) fn handle_data_columns_to_publish(
    &self,
    data_columns_to_publish: DataColumnsToPublish<T::EthSpec>,
) {
    let columns = match data_columns_to_publish {
        Some(columns) => columns,
        // Nothing to publish.
        None => return,
    };

    let messages = columns
        .iter()
        .map(|column| {
            let subnet_id = DataColumnSubnetId::from_column_index::<T::EthSpec>(
                column.index as usize,
                &self.chain.spec,
            );
            PubsubMessage::DataColumnSidecar(Box::new((subnet_id, column.clone())))
        })
        .collect();

    self.send_network_message(NetworkMessage::Publish { messages });
}
/// Send a message on `message_tx` that the `message_id` sent by `peer_id` should be propagated on
/// the gossip network.
///
@@ -1022,9 +997,7 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
.process_gossip_data_columns(vec![verified_data_column], || Ok(()))
.await
{
Ok((availability, data_columns_to_publish)) => {
self.handle_data_columns_to_publish(data_columns_to_publish);
Ok(availability) => {
match availability {
AvailabilityProcessingStatus::Imported(block_root) => {
// Note: Reusing block imported metric here
@@ -1052,7 +1025,7 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
"block_root" => %block_root,
);
// Potentially trigger reconstruction
self.attempt_data_column_reconstruction(block_root).await;
}
}
}

View File

@@ -2,7 +2,9 @@ use crate::sync::manager::BlockProcessType;
use crate::sync::SamplingId;
use crate::{service::NetworkMessage, sync::manager::SyncMessage};
use beacon_chain::block_verification_types::RpcBlock;
use beacon_chain::{builder::Witness, eth1_chain::CachingEth1Backend, BeaconChain};
use beacon_chain::{
builder::Witness, eth1_chain::CachingEth1Backend, AvailabilityProcessingStatus, BeaconChain,
};
use beacon_chain::{BeaconChainTypes, NotifyExecutionLayer};
use beacon_processor::{
work_reprocessing_queue::ReprocessQueueMessage, BeaconProcessorChannels, BeaconProcessorSend,
@@ -16,9 +18,9 @@ use lighthouse_network::rpc::methods::{
use lighthouse_network::rpc::{RequestId, SubstreamId};
use lighthouse_network::{
rpc::{BlocksByRangeRequest, BlocksByRootRequest, LightClientBootstrapRequest, StatusMessage},
Client, MessageId, NetworkGlobals, PeerId,
Client, MessageId, NetworkGlobals, PeerId, PubsubMessage,
};
use slog::{debug, Logger};
use slog::{debug, error, trace, Logger};
use slot_clock::ManualSlotClock;
use std::path::PathBuf;
use std::sync::Arc;
@@ -848,6 +850,75 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
"error" => %e)
});
}
/// Attempt to reconstruct all data columns for `block_root`.
///
/// Reconstruction is intended to run only when both of the following conditions are
/// satisfied:
/// - Our custody requirement is all columns
/// - We have >= 50% of columns, but not all columns
///
/// Any reconstructed columns are published to the network before the head is recomputed.
///
/// Returns `Some(AvailabilityProcessingStatus)` if reconstruction is successfully performed,
/// otherwise returns `None` (either reconstruction did not run, or it failed and the error
/// was logged).
async fn attempt_data_column_reconstruction(
    &self,
    block_root: Hash256,
) -> Option<AvailabilityProcessingStatus> {
    let (status, reconstructed_columns) =
        match self.chain.reconstruct_data_columns(block_root).await {
            Ok(Some(outcome)) => outcome,
            Ok(None) => {
                // reason is tracked via the `KZG_DATA_COLUMN_RECONSTRUCTION_INCOMPLETE_TOTAL` metric
                trace!(
                    self.log,
                    "Reconstruction not required for block";
                    "block_hash" => %block_root,
                );
                return None;
            }
            Err(e) => {
                error!(
                    self.log,
                    "Error during data column reconstruction";
                    "block_root" => %block_root,
                    "error" => ?e
                );
                return None;
            }
        };

    // Publish the reconstructed columns first, so propagation is not delayed by the head
    // recomputation below.
    let messages = reconstructed_columns
        .iter()
        .map(|column| {
            let subnet_id = DataColumnSubnetId::from_column_index::<T::EthSpec>(
                column.index as usize,
                &self.chain.spec,
            );
            PubsubMessage::DataColumnSidecar(Box::new((subnet_id, column.clone())))
        })
        .collect();
    self.send_network_message(NetworkMessage::Publish { messages });

    match &status {
        AvailabilityProcessingStatus::Imported(hash) => {
            debug!(
                self.log,
                "Block components available via reconstruction";
                "result" => "imported block and custody columns",
                "block_hash" => %hash,
            );
            self.chain.recompute_head_at_current_slot().await;
        }
        AvailabilityProcessingStatus::MissingComponents(_, _) => {
            debug!(
                self.log,
                "Block components still missing block after reconstruction";
                "result" => "imported all custody columns",
                "block_hash" => %block_root,
            );
        }
    }

    Some(status)
}
}
type TestBeaconChainType<E> =

View File

@@ -327,34 +327,37 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
_seen_timestamp: Duration,
process_type: BlockProcessType,
) {
let result = self
let mut result = self
.chain
.process_rpc_custody_columns(custody_columns)
.await;
match &result {
Ok((availability, data_columns_to_publish)) => {
self.handle_data_columns_to_publish(data_columns_to_publish.clone());
match availability {
AvailabilityProcessingStatus::Imported(hash) => {
debug!(
self.log,
"Block components retrieved";
"result" => "imported block and custody columns",
"block_hash" => %hash,
);
self.chain.recompute_head_at_current_slot().await;
}
AvailabilityProcessingStatus::MissingComponents(_, _) => {
debug!(
self.log,
"Missing components over rpc";
"block_hash" => %block_root,
);
Ok(availability) => match availability {
AvailabilityProcessingStatus::Imported(hash) => {
debug!(
self.log,
"Block components retrieved";
"result" => "imported block and custody columns",
"block_hash" => %hash,
);
self.chain.recompute_head_at_current_slot().await;
}
AvailabilityProcessingStatus::MissingComponents(_, _) => {
debug!(
self.log,
"Missing components over rpc";
"block_hash" => %block_root,
);
// Attempt reconstruction here before notifying sync, to avoid sending out more requests
// that we may no longer need.
if let Some(availability) =
self.attempt_data_column_reconstruction(block_root).await
{
result = Ok(availability)
}
}
}
},
Err(BlockError::DuplicateFullyImported(_)) => {
debug!(
self.log,
@@ -374,7 +377,7 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
self.send_sync_message(SyncMessage::BlockComponentProcessed {
process_type,
result: result.map(|(r, _)| r).into(),
result: result.into(),
});
}