Improving blob propagation post-PeerDAS with Decentralized Blob Building (#6268)

* Get blobs from EL.

Co-authored-by: Michael Sproul <michael@sigmaprime.io>

* Avoid cloning blobs after fetching blobs.

* Address review comments and refactor code.

* Fix lint.

* Move blob computation metric to the right spot.

* Merge branch 'unstable' into das-fetch-blobs

* Merge branch 'unstable' into das-fetch-blobs

# Conflicts:
#	beacon_node/beacon_chain/src/beacon_chain.rs
#	beacon_node/beacon_chain/src/block_verification.rs
#	beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs

* Merge branch 'unstable' into das-fetch-blobs

# Conflicts:
#	beacon_node/beacon_chain/src/beacon_chain.rs

* Gradual publication of data columns for supernodes.

* Recompute head after importing block with blobs from the EL.

* Fix lint

* Merge branch 'unstable' into das-fetch-blobs

* Use blocking task instead of async when computing cells.

* Merge branch 'das-fetch-blobs' of github.com:jimmygchen/lighthouse into das-fetch-blobs

* Merge remote-tracking branch 'origin/unstable' into das-fetch-blobs

* Fix semantic conflicts

* Downgrade error log.

* Merge branch 'unstable' into das-fetch-blobs

# Conflicts:
#	beacon_node/beacon_chain/src/data_availability_checker.rs
#	beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs
#	beacon_node/execution_layer/src/engine_api.rs
#	beacon_node/execution_layer/src/engine_api/json_structures.rs
#	beacon_node/network/src/network_beacon_processor/gossip_methods.rs
#	beacon_node/network/src/network_beacon_processor/mod.rs
#	beacon_node/network/src/network_beacon_processor/sync_methods.rs

* Merge branch 'unstable' into das-fetch-blobs

* Publish block without waiting for blob and column proof computation.

* Address review comments and refactor.

* Merge branch 'unstable' into das-fetch-blobs

* Fix test and docs.

* Comment cleanups.

* Merge branch 'unstable' into das-fetch-blobs

* Address review comments and cleanup

* Address review comments and cleanup

* Refactor to de-duplicate gradual publication logic.

* Add more logging.

* Merge remote-tracking branch 'origin/unstable' into das-fetch-blobs

# Conflicts:
#	Cargo.lock

* Fix incorrect comparison on `num_fetched_blobs`.

* Implement gradual blob publication.

* Merge branch 'unstable' into das-fetch-blobs

* Inline `publish_fn`.

* Merge branch 'das-fetch-blobs' of github.com:jimmygchen/lighthouse into das-fetch-blobs

* Gossip verify blobs before publishing

* Avoid queries for 0 blobs and error for duplicates

* Gossip verify engine blobs before processing them, and use the observe cache to detect duplicates before publishing.

* Merge branch 'das-fetch-blobs' of github.com:jimmygchen/lighthouse into das-fetch-blobs

# Conflicts:
#	beacon_node/network/src/network_beacon_processor/mod.rs

* Merge branch 'unstable' into das-fetch-blobs

* Fix invalid commitment inclusion proofs in blob sidecars created from EL blobs.

* Only publish EL blobs triggered from gossip block, and not RPC block.

* Downgrade gossip blob log to `debug`.

* Merge branch 'unstable' into das-fetch-blobs

* Merge branch 'unstable' into das-fetch-blobs

* Grammar
This commit is contained in:
Jimmy Chen
2024-11-15 10:34:13 +07:00
committed by GitHub
parent 8e95024945
commit 5f053b0b6d
36 changed files with 1660 additions and 613 deletions

View File

@@ -914,18 +914,15 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
let blob_slot = verified_blob.slot();
let blob_index = verified_blob.id().index;
let result = self
.chain
.process_gossip_blob(verified_blob, || Ok(()))
.await;
let result = self.chain.process_gossip_blob(verified_blob).await;
match &result {
Ok(AvailabilityProcessingStatus::Imported(block_root)) => {
// Note: Reusing block imported metric here
metrics::inc_counter(&metrics::BEACON_PROCESSOR_GOSSIP_BLOCK_IMPORTED_TOTAL);
info!(
debug!(
self.log,
"Gossipsub blob processed, imported fully available block";
"Gossipsub blob processed - imported fully available block";
"block_root" => %block_root
);
self.chain.recompute_head_at_current_slot().await;
@@ -936,9 +933,9 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
);
}
Ok(AvailabilityProcessingStatus::MissingComponents(slot, block_root)) => {
trace!(
debug!(
self.log,
"Processed blob, waiting for other components";
"Processed gossip blob - waiting for other components";
"slot" => %slot,
"blob_index" => %blob_index,
"block_root" => %block_root,
@@ -1079,7 +1076,7 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
message_id,
peer_id,
peer_client,
block,
block.clone(),
reprocess_tx.clone(),
seen_duration,
)
@@ -1497,6 +1494,13 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
"slot" => slot,
"block_root" => %block_root,
);
// Block is valid, we can now attempt fetching blobs from EL using version hashes
// derived from kzg commitments from the block, without having to wait for all blobs
// to be sent from the peers if we already have them.
let publish_blobs = true;
self.fetch_engine_blobs_and_publish(block.clone(), *block_root, publish_blobs)
.await;
}
Err(BlockError::ParentUnknown { .. }) => {
// This should not occur. It should be checked by `should_forward_block`.

View File

@@ -1,11 +1,17 @@
use crate::sync::manager::BlockProcessType;
use crate::sync::SamplingId;
use crate::{service::NetworkMessage, sync::manager::SyncMessage};
use beacon_chain::blob_verification::{GossipBlobError, GossipVerifiedBlob};
use beacon_chain::block_verification_types::RpcBlock;
use beacon_chain::data_column_verification::{observe_gossip_data_column, GossipDataColumnError};
use beacon_chain::fetch_blobs::{
fetch_and_process_engine_blobs, BlobsOrDataColumns, FetchEngineBlobError,
};
use beacon_chain::observed_data_sidecars::DoNotObserve;
use beacon_chain::{
builder::Witness, eth1_chain::CachingEth1Backend, AvailabilityProcessingStatus, BeaconChain,
BeaconChainTypes, BlockError, NotifyExecutionLayer,
};
use beacon_chain::{BeaconChainTypes, NotifyExecutionLayer};
use beacon_processor::{
work_reprocessing_queue::ReprocessQueueMessage, BeaconProcessorChannels, BeaconProcessorSend,
DuplicateCache, GossipAggregatePackage, GossipAttestationPackage, Work,
@@ -21,7 +27,8 @@ use lighthouse_network::{
rpc::{BlocksByRangeRequest, BlocksByRootRequest, LightClientBootstrapRequest, StatusMessage},
Client, MessageId, NetworkGlobals, PeerId, PubsubMessage,
};
use slog::{debug, error, trace, Logger};
use rand::prelude::SliceRandom;
use slog::{debug, error, trace, warn, Logger};
use slot_clock::ManualSlotClock;
use std::path::PathBuf;
use std::sync::Arc;
@@ -67,6 +74,9 @@ pub struct NetworkBeaconProcessor<T: BeaconChainTypes> {
pub log: Logger,
}
// Publish blobs in batches of exponentially increasing size.
const BLOB_PUBLICATION_EXP_FACTOR: usize = 2;
impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
fn try_send(&self, event: BeaconWorkEvent<T::EthSpec>) -> Result<(), Error<T::EthSpec>> {
self.beacon_processor_send
@@ -878,6 +888,79 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
});
}
/// Attempt to fetch the block's blobs from the execution layer's blob pool and
/// process them, without waiting for them to arrive from peers.
///
/// If `publish_blobs` is `true`, any blobs (or data columns derived from them)
/// returned by the EL are also re-published to gossip in gradual batches via
/// `publish_blobs_gradually` / `publish_data_columns_gradually`.
/// NOTE(review): callers appear to set this only for gossip blocks, not RPC
/// blocks — confirm at call sites.
pub async fn fetch_engine_blobs_and_publish(
self: &Arc<Self>,
block: Arc<SignedBeaconBlock<T::EthSpec, FullPayload<T::EthSpec>>>,
block_root: Hash256,
publish_blobs: bool,
) {
let self_cloned = self.clone();
// Publication callback handed to `fetch_and_process_engine_blobs`.
// It is a no-op when `publish_blobs` is false.
let publish_fn = move |blobs_or_data_column| {
if publish_blobs {
match blobs_or_data_column {
BlobsOrDataColumns::Blobs(blobs) => {
self_cloned.publish_blobs_gradually(blobs, block_root);
}
BlobsOrDataColumns::DataColumns(columns) => {
self_cloned.publish_data_columns_gradually(columns, block_root);
}
};
}
};
match fetch_and_process_engine_blobs(
self.chain.clone(),
block_root,
block.clone(),
publish_fn,
)
.await
{
Ok(Some(availability)) => match availability {
AvailabilityProcessingStatus::Imported(_) => {
debug!(
self.log,
"Block components retrieved from EL";
"result" => "imported block and custody columns",
"block_root" => %block_root,
);
// The block became fully available via the EL blobs, so the head
// may have changed — recompute it now.
self.chain.recompute_head_at_current_slot().await;
}
AvailabilityProcessingStatus::MissingComponents(_, _) => {
debug!(
self.log,
"Still missing blobs after engine blobs processed successfully";
"block_root" => %block_root,
);
}
},
// The EL had no (new) blobs for this block; nothing was imported.
Ok(None) => {
debug!(
self.log,
"Fetch blobs completed without import";
"block_root" => %block_root,
);
}
// Racing with another import path is expected; a duplicate import is
// benign, so log at debug rather than error.
Err(FetchEngineBlobError::BlobProcessingError(BlockError::DuplicateFullyImported(
..,
))) => {
debug!(
self.log,
"Fetch blobs duplicate import";
"block_root" => %block_root,
);
}
Err(e) => {
error!(
self.log,
"Error fetching or processing blobs from EL";
"error" => ?e,
"block_root" => %block_root,
);
}
}
}
/// Attempt to reconstruct all data columns if the following conditions are satisfied:
/// - Our custody requirement is all columns
/// - We have >= 50% of columns, but not all columns
@@ -885,25 +968,13 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
/// Returns `Some(AvailabilityProcessingStatus)` if reconstruction is successfully performed,
/// otherwise returns `None`.
async fn attempt_data_column_reconstruction(
&self,
self: &Arc<Self>,
block_root: Hash256,
) -> Option<AvailabilityProcessingStatus> {
let result = self.chain.reconstruct_data_columns(block_root).await;
match result {
Ok(Some((availability_processing_status, data_columns_to_publish))) => {
self.send_network_message(NetworkMessage::Publish {
messages: data_columns_to_publish
.iter()
.map(|d| {
let subnet = DataColumnSubnetId::from_column_index::<T::EthSpec>(
d.index as usize,
&self.chain.spec,
);
PubsubMessage::DataColumnSidecar(Box::new((subnet, d.clone())))
})
.collect(),
});
self.publish_data_columns_gradually(data_columns_to_publish, block_root);
match &availability_processing_status {
AvailabilityProcessingStatus::Imported(hash) => {
debug!(
@@ -946,6 +1017,175 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
}
}
}
/// This function gradually publishes blobs to the network in randomised batches.
///
/// This is an optimisation to reduce outbound bandwidth and ensures each blob is published
/// by some nodes on the network as soon as possible. Our hope is that some blobs arrive from
/// other nodes in the meantime, obviating the need for us to publish them. If no other
/// publisher exists for a blob, it will eventually get published here.
fn publish_blobs_gradually(
self: &Arc<Self>,
// Blobs verified but deliberately NOT yet observed (`DoNotObserve`), so we
// can detect below whether gossip delivered them first.
mut blobs: Vec<GossipVerifiedBlob<T, DoNotObserve>>,
block_root: Hash256,
) {
let self_clone = self.clone();
// Publication runs on a spawned task so the caller is not blocked by the
// per-batch sleeps below.
self.executor.spawn(
async move {
let chain = self_clone.chain.clone();
let log = self_clone.chain.logger();
// Sends one gossip `Publish` message containing the whole batch.
let publish_fn = |blobs: Vec<Arc<BlobSidecar<T::EthSpec>>>| {
self_clone.send_network_message(NetworkMessage::Publish {
messages: blobs
.into_iter()
.map(|blob| PubsubMessage::BlobSidecar(Box::new((blob.index, blob))))
.collect(),
});
};
// Permute the blobs and split them into batches.
// The hope is that we won't need to publish some blobs because we will receive them
// on gossip from other nodes.
blobs.shuffle(&mut rand::thread_rng());
let blob_publication_batch_interval = chain.config.blob_publication_batch_interval;
let mut publish_count = 0usize;
let blob_count = blobs.len();
let mut blobs_iter = blobs.into_iter().peekable();
// Batch size grows exponentially (×BLOB_PUBLICATION_EXP_FACTOR per
// round): trickle at first, then flush quickly if gossip hasn't
// delivered the remainder.
let mut batch_size = 1usize;
while blobs_iter.peek().is_some() {
let batch = blobs_iter.by_ref().take(batch_size);
let publishable = batch
// `observe` marks the blob as seen; `RepeatBlob` means it was
// already observed (e.g. received on gossip), so skip publishing it.
.filter_map(|unobserved| match unobserved.observe(&chain) {
Ok(observed) => Some(observed.clone_blob()),
Err(GossipBlobError::RepeatBlob { .. }) => None,
Err(e) => {
warn!(
log,
"Previously verified blob is invalid";
"error" => ?e
);
None
}
})
.collect::<Vec<_>>();
if !publishable.is_empty() {
debug!(
log,
"Publishing blob batch";
"publish_count" => publishable.len(),
"block_root" => ?block_root,
);
publish_count += publishable.len();
publish_fn(publishable);
}
// Wait between batches to give gossip a chance to deliver the rest.
tokio::time::sleep(blob_publication_batch_interval).await;
batch_size *= BLOB_PUBLICATION_EXP_FACTOR;
}
debug!(
log,
"Batch blob publication complete";
"batch_interval" => blob_publication_batch_interval.as_millis(),
"blob_count" => blob_count,
"published_count" => publish_count,
"block_root" => ?block_root,
)
},
"gradual_blob_publication",
);
}
/// This function gradually publishes data columns to the network in randomised batches.
///
/// This is an optimisation to reduce outbound bandwidth and ensures each column is published
/// by some nodes on the network as soon as possible. Our hope is that some columns arrive from
/// other supernodes in the meantime, obviating the need for us to publish them. If no other
/// publisher exists for a column, it will eventually get published here.
fn publish_data_columns_gradually(
self: &Arc<Self>,
mut data_columns_to_publish: DataColumnSidecarList<T::EthSpec>,
block_root: Hash256,
) {
let self_clone = self.clone();
// Publication runs on a spawned task so the caller is not blocked by the
// per-batch sleeps below.
self.executor.spawn(
async move {
let chain = self_clone.chain.clone();
let log = self_clone.chain.logger();
// Sends one gossip `Publish` message containing the whole batch, with
// each column routed to its subnet derived from its index.
let publish_fn = |columns: DataColumnSidecarList<T::EthSpec>| {
self_clone.send_network_message(NetworkMessage::Publish {
messages: columns
.into_iter()
.map(|d| {
let subnet = DataColumnSubnetId::from_column_index::<T::EthSpec>(
d.index as usize,
&chain.spec,
);
PubsubMessage::DataColumnSidecar(Box::new((subnet, d)))
})
.collect(),
});
};
// If this node is a super node, permute the columns and split them into batches.
// The hope is that we won't need to publish some columns because we will receive them
// on gossip from other supernodes.
data_columns_to_publish.shuffle(&mut rand::thread_rng());
let blob_publication_batch_interval = chain.config.blob_publication_batch_interval;
let blob_publication_batches = chain.config.blob_publication_batches;
// Unlike blob publication, columns use a fixed batch size derived from
// the configured number of batches.
let batch_size = chain.spec.number_of_columns / blob_publication_batches;
let mut publish_count = 0usize;
for batch in data_columns_to_publish.chunks(batch_size) {
let publishable = batch
.iter()
// `PriorKnown` means the column was already observed (e.g. received
// on gossip), so skip publishing it.
.filter_map(|col| match observe_gossip_data_column(col, &chain) {
Ok(()) => Some(col.clone()),
Err(GossipDataColumnError::PriorKnown { .. }) => None,
Err(e) => {
warn!(
log,
"Previously verified data column is invalid";
"error" => ?e
);
None
}
})
.collect::<Vec<_>>();
if !publishable.is_empty() {
debug!(
log,
"Publishing data column batch";
"publish_count" => publishable.len(),
"block_root" => ?block_root,
);
publish_count += publishable.len();
publish_fn(publishable);
}
// Wait between batches to give gossip a chance to deliver the rest.
tokio::time::sleep(blob_publication_batch_interval).await;
}
debug!(
log,
"Batch data column publishing complete";
"batch_size" => batch_size,
"batch_interval" => blob_publication_batch_interval.as_millis(),
"data_columns_to_publish_count" => data_columns_to_publish.len(),
"published_count" => publish_count,
"block_root" => ?block_root,
)
},
"gradual_data_column_publication",
);
}
}
type TestBeaconChainType<E> =

View File

@@ -153,6 +153,7 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
"process_type" => ?process_type,
);
let signed_beacon_block = block.block_cloned();
let result = self
.chain
.process_block_with_early_caching(
@@ -166,26 +167,37 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
metrics::inc_counter(&metrics::BEACON_PROCESSOR_RPC_BLOCK_IMPORTED_TOTAL);
// RPC block imported, regardless of process type
if let &Ok(AvailabilityProcessingStatus::Imported(hash)) = &result {
info!(self.log, "New RPC block received"; "slot" => slot, "hash" => %hash);
match result.as_ref() {
Ok(AvailabilityProcessingStatus::Imported(hash)) => {
info!(self.log, "New RPC block received"; "slot" => slot, "hash" => %hash);
// Trigger processing for work referencing this block.
let reprocess_msg = ReprocessQueueMessage::BlockImported {
block_root: hash,
parent_root,
};
if reprocess_tx.try_send(reprocess_msg).is_err() {
error!(self.log, "Failed to inform block import"; "source" => "rpc", "block_root" => %hash)
};
self.chain.block_times_cache.write().set_time_observed(
hash,
slot,
seen_timestamp,
None,
None,
);
// Trigger processing for work referencing this block.
let reprocess_msg = ReprocessQueueMessage::BlockImported {
block_root: *hash,
parent_root,
};
if reprocess_tx.try_send(reprocess_msg).is_err() {
error!(self.log, "Failed to inform block import"; "source" => "rpc", "block_root" => %hash)
};
self.chain.block_times_cache.write().set_time_observed(
*hash,
slot,
seen_timestamp,
None,
None,
);
self.chain.recompute_head_at_current_slot().await;
self.chain.recompute_head_at_current_slot().await;
}
Ok(AvailabilityProcessingStatus::MissingComponents(..)) => {
// Block is valid, we can now attempt fetching blobs from EL using version hashes
// derived from kzg commitments from the block, without having to wait for all blobs
// to be sent from the peers if we already have them.
let publish_blobs = false;
self.fetch_engine_blobs_and_publish(signed_beacon_block, block_root, publish_blobs)
.await
}
_ => {}
}
// RPC block imported or execution validated. If the block was already imported by gossip we