Refactor data column reconstruction and avoid blocking processing (#6403)

* Move reconstruction logic out of `overflow_lru_cache` to simplify the code, avoid having to pass `DataColumnsToPublish` around, and avoid blocking other processing.

* Publish reconstructed cells before recomputing the head; remove duplicate functions (see the ordering sketch after this commit log).

* Merge branch 'unstable' into non-blocking-reconstruction

* Merge branch 'unstable' into non-blocking-reconstruction

# Conflicts:
#	beacon_node/beacon_chain/src/beacon_chain.rs
#	beacon_node/beacon_chain/src/data_availability_checker.rs
#	beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs
#	beacon_node/network/src/network_beacon_processor/sync_methods.rs

* Spawn a blocking task for reconstruction.

* Merge branch 'unstable' into non-blocking-reconstruction

# Conflicts:
#	beacon_node/network/src/network_beacon_processor/mod.rs

* Fix fmt

* Merge branch 'unstable' into non-blocking-reconstruction

# Conflicts:
#	beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs

* Fix race condition by making the check and mutation atomic, as suggested by Lion. Also add error handling for reconstruction failure.

* Add reconstruction reason metric and more debug logging to the DA checker.

* Add comment and logging.

* Rename `NotRequired` to `NotStarted`.

* Remove extra character added.
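
Taken together, the commits above establish a single ordering for reconstruction: recover columns on a blocking thread, import them, publish the newly recovered columns, and only then recompute the head. Below is a minimal free-standing sketch of that ordering, assuming a tokio runtime; `Chain` and its methods are hypothetical stand-ins for the real `BeaconChain::reconstruct_data_columns` and `NetworkBeaconProcessor::attempt_data_column_reconstruction` shown in the diff.

use std::sync::Arc;

// Hypothetical stand-in for the real chain type; method bodies are elided.
struct Chain;

impl Chain {
    fn recover_columns(&self) -> Vec<u64> { vec![] }   // CPU-heavy KZG recovery
    fn import_columns(&self, _columns: &[u64]) {}      // availability-cache import
    async fn publish(&self, _columns: &[u64]) {}       // gossip the new columns
    async fn recompute_head(&self) {}                  // fork-choice update
}

async fn attempt_reconstruction(chain: Arc<Chain>) {
    // 1. Offload KZG recovery to a blocking thread so async tasks keep progressing.
    let worker = chain.clone();
    let columns = tokio::task::spawn_blocking(move || worker.recover_columns())
        .await
        .expect("blocking task panicked");
    // 2. Import the recovered columns into the availability cache.
    chain.import_columns(&columns);
    // 3. Publish before the (potentially slow) head recompute, so peers see the
    //    reconstructed columns as early as possible.
    chain.publish(&columns).await;
    chain.recompute_head().await;
}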
Jimmy Chen
2024-10-17 15:56:25 +11:00
committed by GitHub
parent 772929fae2
commit ee7fca3ebd
9 changed files with 454 additions and 246 deletions

View File

@@ -22,7 +22,7 @@ pub use crate::canonical_head::CanonicalHead;
use crate::chain_config::ChainConfig;
use crate::data_availability_checker::{
Availability, AvailabilityCheckError, AvailableBlock, DataAvailabilityChecker,
DataColumnsToPublish,
DataColumnReconstructionResult,
};
use crate::data_column_verification::{GossipDataColumnError, GossipVerifiedDataColumn};
use crate::early_attester_cache::EarlyAttesterCache;
@@ -3015,13 +3015,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
self: &Arc<Self>,
data_columns: Vec<GossipVerifiedDataColumn<T>>,
publish_fn: impl FnOnce() -> Result<(), BlockError>,
) -> Result<
(
AvailabilityProcessingStatus,
DataColumnsToPublish<T::EthSpec>,
),
BlockError,
> {
) -> Result<AvailabilityProcessingStatus, BlockError> {
let Ok((slot, block_root)) = data_columns
.iter()
.map(|c| (c.slot(), c.block_root()))
@@ -3051,7 +3045,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
publish_fn,
)
.await;
self.remove_notified_custody_columns(&block_root, r)
self.remove_notified(&block_root, r)
}
/// Cache the blobs in the processing cache, process it, then evict it from the cache if it was
@@ -3110,13 +3104,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
pub async fn process_rpc_custody_columns(
self: &Arc<Self>,
custody_columns: DataColumnSidecarList<T::EthSpec>,
) -> Result<
(
AvailabilityProcessingStatus,
DataColumnsToPublish<T::EthSpec>,
),
BlockError,
> {
) -> Result<AvailabilityProcessingStatus, BlockError> {
let Ok((slot, block_root)) = custody_columns
.iter()
.map(|c| (c.slot(), c.block_root()))
@@ -3154,7 +3142,67 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
let r = self
.check_rpc_custody_columns_availability_and_import(slot, block_root, custody_columns)
.await;
self.remove_notified_custody_columns(&block_root, r)
self.remove_notified(&block_root, r)
}
pub async fn reconstruct_data_columns(
self: &Arc<Self>,
block_root: Hash256,
) -> Result<
Option<(
AvailabilityProcessingStatus,
DataColumnSidecarList<T::EthSpec>,
)>,
BlockError,
> {
// As of now we only reconstruct data columns on supernodes, so if the block is already
// available on a supernode, there's no need to reconstruct as the node must already have
// all columns.
if self
.canonical_head
.fork_choice_read_lock()
.contains_block(&block_root)
{
return Ok(None);
}
let data_availability_checker = self.data_availability_checker.clone();
let result = self
.task_executor
.spawn_blocking_handle(
move || data_availability_checker.reconstruct_data_columns(&block_root),
"reconstruct_data_columns",
)
.ok_or(BeaconChainError::RuntimeShutdown)?
.await
.map_err(BeaconChainError::TokioJoin)??;
match result {
DataColumnReconstructionResult::Success((availability, data_columns_to_publish)) => {
let Some(slot) = data_columns_to_publish.first().map(|d| d.slot()) else {
// This should be unreachable because an empty result would return `RecoveredColumnsNotImported` instead of `Success`.
return Ok(None);
};
let r = self
.process_availability(slot, availability, || Ok(()))
.await;
self.remove_notified(&block_root, r)
.map(|availability_processing_status| {
Some((availability_processing_status, data_columns_to_publish))
})
}
DataColumnReconstructionResult::NotStarted(reason)
| DataColumnReconstructionResult::RecoveredColumnsNotImported(reason) => {
// We use a metric here because logging this would be *very* noisy.
metrics::inc_counter_vec(
&metrics::KZG_DATA_COLUMN_RECONSTRUCTION_INCOMPLETE_TOTAL,
&[reason],
);
Ok(None)
}
}
}
/// Remove any block components from the *processing cache* if we no longer require them. If the
@@ -3172,23 +3220,6 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
r
}
/// Remove any block components from the *processing cache* if we no longer require them. If the
/// block was imported full or erred, we no longer require them.
fn remove_notified_custody_columns<P>(
&self,
block_root: &Hash256,
r: Result<(AvailabilityProcessingStatus, P), BlockError>,
) -> Result<(AvailabilityProcessingStatus, P), BlockError> {
let has_missing_components = matches!(
r,
Ok((AvailabilityProcessingStatus::MissingComponents(_, _), _))
);
if !has_missing_components {
self.reqresp_pre_import_cache.write().remove(block_root);
}
r
}
/// Wraps `process_block` in logic to cache the block's commitments in the processing cache
/// and evict if the block was imported or errored.
pub async fn process_block_with_early_caching<B: IntoExecutionPendingBlock<T>>(
@@ -3444,26 +3475,21 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
block_root: Hash256,
data_columns: Vec<GossipVerifiedDataColumn<T>>,
publish_fn: impl FnOnce() -> Result<(), BlockError>,
) -> Result<
(
AvailabilityProcessingStatus,
DataColumnsToPublish<T::EthSpec>,
),
BlockError,
> {
) -> Result<AvailabilityProcessingStatus, BlockError> {
if let Some(slasher) = self.slasher.as_ref() {
for data_column in &data_columns {
slasher.accept_block_header(data_column.signed_block_header());
}
}
let (availability, data_columns_to_publish) = self
.data_availability_checker
.put_gossip_data_columns(slot, block_root, data_columns)?;
let availability = self.data_availability_checker.put_gossip_data_columns(
slot,
block_root,
data_columns,
)?;
self.process_availability(slot, availability, publish_fn)
.await
.map(|result| (result, data_columns_to_publish))
}
/// Checks if the provided blobs can make any cached blocks available, and imports immediately
@@ -3513,13 +3539,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
slot: Slot,
block_root: Hash256,
custody_columns: DataColumnSidecarList<T::EthSpec>,
) -> Result<
(
AvailabilityProcessingStatus,
DataColumnsToPublish<T::EthSpec>,
),
BlockError,
> {
) -> Result<AvailabilityProcessingStatus, BlockError> {
// Need to scope this to ensure the lock is dropped before calling `process_availability`
// Even an explicit drop is not enough to convince the borrow checker.
{
@@ -3544,16 +3564,14 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
// This slot value is purely informative for the consumers of
// `AvailabilityProcessingStatus::MissingComponents` to log an error with a slot.
let (availability, data_columns_to_publish) =
self.data_availability_checker.put_rpc_custody_columns(
block_root,
slot.epoch(T::EthSpec::slots_per_epoch()),
custody_columns,
)?;
let availability = self.data_availability_checker.put_rpc_custody_columns(
block_root,
slot.epoch(T::EthSpec::slots_per_epoch()),
custody_columns,
)?;
self.process_availability(slot, availability, || Ok(()))
.await
.map(|result| (result, data_columns_to_publish))
}
/// Imports a fully available block. Otherwise, returns `AvailabilityProcessingStatus::MissingComponents`
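
One subtlety in `reconstruct_data_columns` above is the double `??` after awaiting the blocking handle: the first `?` propagates a task join error, the second propagates the reconstruction's own `Result`. A free-standing sketch of the same shape with plain tokio follows; the `Error` type is hypothetical, and Lighthouse's `TaskExecutor` additionally handles runtime shutdown via the `ok_or(RuntimeShutdown)` branch.

use tokio::task;

// Hypothetical error type for the sketch.
#[derive(Debug)]
enum Error {
    Join(task::JoinError),
    Reconstruction(&'static str),
}

async fn offload_reconstruction() -> Result<u64, Error> {
    // The closure returns Result<u64, Error>, so awaiting the handle yields
    // Result<Result<u64, Error>, JoinError> -- hence the two `?`s below.
    let handle = task::spawn_blocking(|| -> Result<u64, Error> {
        // CPU-heavy KZG reconstruction would run here; failures would map to
        // Error::Reconstruction("...").
        Ok(42)
    });
    let value = handle.await.map_err(Error::Join)??;
    Ok(value)
}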

View File

@@ -984,6 +984,7 @@ where
store,
self.import_all_data_columns,
self.spec,
log.new(o!("service" => "data_availability_checker")),
)
.map_err(|e| format!("Error initializing DataAvailabilityChecker: {:?}", e))?,
),

View File

@@ -2,10 +2,12 @@ use crate::blob_verification::{verify_kzg_for_blob_list, GossipVerifiedBlob, Kzg
use crate::block_verification_types::{
AvailabilityPendingExecutedBlock, AvailableExecutedBlock, RpcBlock,
};
use crate::data_availability_checker::overflow_lru_cache::DataAvailabilityCheckerInner;
use crate::{BeaconChain, BeaconChainTypes, BeaconStore};
use crate::data_availability_checker::overflow_lru_cache::{
DataAvailabilityCheckerInner, ReconstructColumnsDecision,
};
use crate::{metrics, BeaconChain, BeaconChainTypes, BeaconStore};
use kzg::Kzg;
use slog::{debug, error};
use slog::{debug, error, Logger};
use slot_clock::SlotClock;
use std::fmt;
use std::fmt::Debug;
@@ -27,11 +29,12 @@ use crate::data_column_verification::{
verify_kzg_for_data_column, verify_kzg_for_data_column_list, CustodyDataColumn,
GossipVerifiedDataColumn, KzgVerifiedCustodyDataColumn, KzgVerifiedDataColumn,
};
use crate::metrics::{
KZG_DATA_COLUMN_RECONSTRUCTION_ATTEMPTS, KZG_DATA_COLUMN_RECONSTRUCTION_FAILURES,
};
pub use error::{Error as AvailabilityCheckError, ErrorCategory as AvailabilityCheckErrorCategory};
use types::non_zero_usize::new_non_zero_usize;
pub use self::overflow_lru_cache::DataColumnsToPublish;
/// The LRU Cache stores `PendingComponents` which can store up to
/// `MAX_BLOBS_PER_BLOCK = 6` blobs each. A `BlobSidecar` is 0.131256 MB. So
/// the maximum size of a `PendingComponents` is ~ 0.787536 MB. Setting this
@@ -71,6 +74,16 @@ pub struct DataAvailabilityChecker<T: BeaconChainTypes> {
slot_clock: T::SlotClock,
kzg: Arc<Kzg>,
spec: Arc<ChainSpec>,
log: Logger,
}
pub type AvailabilityAndReconstructedColumns<E> = (Availability<E>, DataColumnSidecarList<E>);
#[derive(Debug)]
pub enum DataColumnReconstructionResult<E: EthSpec> {
Success(AvailabilityAndReconstructedColumns<E>),
NotStarted(&'static str),
RecoveredColumnsNotImported(&'static str),
}
/// This type is returned after adding a block / blob to the `DataAvailabilityChecker`.
@@ -101,6 +114,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
store: BeaconStore<T>,
import_all_data_columns: bool,
spec: Arc<ChainSpec>,
log: Logger,
) -> Result<Self, AvailabilityCheckError> {
let custody_subnet_count = if import_all_data_columns {
spec.data_column_sidecar_subnet_count as usize
@@ -124,6 +138,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
slot_clock,
kzg,
spec,
log,
})
}
@@ -205,7 +220,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
.map_err(AvailabilityCheckError::InvalidBlobs)?;
self.availability_cache
.put_kzg_verified_blobs(block_root, epoch, verified_blobs)
.put_kzg_verified_blobs(block_root, epoch, verified_blobs, &self.log)
}
/// Put a list of custody columns received via RPC into the availability cache. This performs KZG
@@ -216,8 +231,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
block_root: Hash256,
epoch: Epoch,
custody_columns: DataColumnSidecarList<T::EthSpec>,
) -> Result<(Availability<T::EthSpec>, DataColumnsToPublish<T::EthSpec>), AvailabilityCheckError>
{
) -> Result<Availability<T::EthSpec>, AvailabilityCheckError> {
// TODO(das): report which column is invalid for proper peer scoring
// TODO(das): batch KZG verification here, but fallback into checking each column
// individually to report which column(s) are invalid.
@@ -233,10 +247,10 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
.collect::<Result<Vec<_>, AvailabilityCheckError>>()?;
self.availability_cache.put_kzg_verified_data_columns(
&self.kzg,
block_root,
epoch,
verified_custody_columns,
&self.log,
)
}
@@ -253,6 +267,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
gossip_blob.block_root(),
gossip_blob.epoch(),
vec![gossip_blob.into_inner()],
&self.log,
)
}
@@ -267,8 +282,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
slot: Slot,
block_root: Hash256,
gossip_data_columns: Vec<GossipVerifiedDataColumn<T>>,
) -> Result<(Availability<T::EthSpec>, DataColumnsToPublish<T::EthSpec>), AvailabilityCheckError>
{
) -> Result<Availability<T::EthSpec>, AvailabilityCheckError> {
let epoch = slot.epoch(T::EthSpec::slots_per_epoch());
let custody_columns = gossip_data_columns
@@ -277,10 +291,10 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
.collect::<Vec<_>>();
self.availability_cache.put_kzg_verified_data_columns(
&self.kzg,
block_root,
epoch,
custody_columns,
&self.log,
)
}
@@ -291,7 +305,7 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
executed_block: AvailabilityPendingExecutedBlock<T::EthSpec>,
) -> Result<Availability<T::EthSpec>, AvailabilityCheckError> {
self.availability_cache
.put_pending_executed_block(executed_block)
.put_pending_executed_block(executed_block, &self.log)
}
pub fn remove_pending_components(&self, block_root: Hash256) {
@@ -511,6 +525,92 @@ impl<T: BeaconChainTypes> DataAvailabilityChecker<T> {
block_cache_size: self.availability_cache.block_cache_size(),
}
}
pub fn reconstruct_data_columns(
&self,
block_root: &Hash256,
) -> Result<DataColumnReconstructionResult<T::EthSpec>, AvailabilityCheckError> {
let pending_components = match self
.availability_cache
.check_and_set_reconstruction_started(block_root)
{
ReconstructColumnsDecision::Yes(pending_components) => pending_components,
ReconstructColumnsDecision::No(reason) => {
return Ok(DataColumnReconstructionResult::NotStarted(reason));
}
};
metrics::inc_counter(&KZG_DATA_COLUMN_RECONSTRUCTION_ATTEMPTS);
let timer = metrics::start_timer(&metrics::DATA_AVAILABILITY_RECONSTRUCTION_TIME);
let all_data_columns = KzgVerifiedCustodyDataColumn::reconstruct_columns(
&self.kzg,
&pending_components.verified_data_columns,
&self.spec,
)
.map_err(|e| {
error!(
self.log,
"Error reconstructing data columns";
"block_root" => ?block_root,
"error" => ?e
);
self.availability_cache
.handle_reconstruction_failure(block_root);
metrics::inc_counter(&KZG_DATA_COLUMN_RECONSTRUCTION_FAILURES);
AvailabilityCheckError::ReconstructColumnsError(e)
})?;
// Check indices from cache again to make sure we don't publish components we've already received.
let Some(existing_column_indices) = self.cached_data_column_indexes(block_root) else {
return Ok(DataColumnReconstructionResult::RecoveredColumnsNotImported(
"block already imported",
));
};
let data_columns_to_publish = all_data_columns
.into_iter()
.filter(|d| !existing_column_indices.contains(&d.index()))
.collect::<Vec<_>>();
let Some(slot) = data_columns_to_publish
.first()
.map(|d| d.as_data_column().slot())
else {
return Ok(DataColumnReconstructionResult::RecoveredColumnsNotImported(
"No new columns to import and publish",
));
};
metrics::stop_timer(timer);
metrics::inc_counter_by(
&metrics::DATA_AVAILABILITY_RECONSTRUCTED_COLUMNS,
data_columns_to_publish.len() as u64,
);
debug!(self.log, "Reconstructed columns";
"count" => data_columns_to_publish.len(),
"block_root" => ?block_root,
"slot" => slot,
);
self.availability_cache
.put_kzg_verified_data_columns(
*block_root,
slot.epoch(T::EthSpec::slots_per_epoch()),
data_columns_to_publish.clone(),
&self.log,
)
.map(|availability| {
DataColumnReconstructionResult::Success((
availability,
data_columns_to_publish
.into_iter()
.map(|d| d.clone_arc())
.collect::<Vec<_>>(),
))
})
}
}
/// Helper struct to group data availability checker metrics.

View File

@@ -6,23 +6,19 @@ use crate::block_verification_types::{
};
use crate::data_availability_checker::{Availability, AvailabilityCheckError};
use crate::data_column_verification::KzgVerifiedCustodyDataColumn;
use crate::metrics;
use crate::BeaconChainTypes;
use kzg::Kzg;
use lru::LruCache;
use parking_lot::RwLock;
use slog::{debug, Logger};
use ssz_types::{FixedVector, VariableList};
use std::collections::HashSet;
use std::num::NonZeroUsize;
use std::sync::Arc;
use types::blob_sidecar::BlobIdentifier;
use types::{
BlobSidecar, ChainSpec, ColumnIndex, DataColumnIdentifier, DataColumnSidecar,
DataColumnSidecarList, Epoch, EthSpec, Hash256, SignedBeaconBlock,
BlobSidecar, ChainSpec, ColumnIndex, DataColumnIdentifier, DataColumnSidecar, Epoch, EthSpec,
Hash256, SignedBeaconBlock,
};
pub type DataColumnsToPublish<E> = Option<DataColumnSidecarList<E>>;
/// This represents the components of a partially available block
///
/// The blobs are all gossip and kzg verified.
@@ -95,7 +91,7 @@ impl<E: EthSpec> PendingComponents<E> {
/// block.
///
/// This corresponds to the number of commitments that are present in a block.
pub fn num_expected_blobs(&self) -> Option<usize> {
pub fn block_kzg_commitments_count(&self) -> Option<usize> {
self.get_cached_block()
.as_ref()
.map(|b| b.get_commitments().len())
@@ -203,21 +199,61 @@ impl<E: EthSpec> PendingComponents<E> {
///
/// Returns `true` if both the block exists and the number of received blobs / custody columns
/// matches the number of expected blobs / custody columns.
pub fn is_available(&self, block_import_requirement: &BlockImportRequirement) -> bool {
pub fn is_available(
&self,
block_import_requirement: &BlockImportRequirement,
log: &Logger,
) -> bool {
let block_kzg_commitments_count_opt = self.block_kzg_commitments_count();
match block_import_requirement {
BlockImportRequirement::AllBlobs => self
.num_expected_blobs()
.map_or(false, |num_expected_blobs| {
num_expected_blobs == self.num_received_blobs()
}),
BlockImportRequirement::AllBlobs => {
let received_blobs = self.num_received_blobs();
let expected_blobs_msg = block_kzg_commitments_count_opt
.as_ref()
.map(|num| num.to_string())
.unwrap_or("unknown".to_string());
debug!(log,
"Component(s) added to data availability checker";
"block_root" => ?self.block_root,
"received_block" => block_kzg_commitments_count_opt.is_some(),
"received_blobs" => received_blobs,
"expected_blobs" => expected_blobs_msg,
);
block_kzg_commitments_count_opt.map_or(false, |num_expected_blobs| {
num_expected_blobs == received_blobs
})
}
BlockImportRequirement::ColumnSampling(num_expected_columns) => {
let num_received_data_columns = self.num_received_data_columns();
// No data columns when there are 0 blobs
self.num_expected_blobs()
.map_or(false, |num_expected_blobs| {
num_expected_blobs == 0
|| *num_expected_columns == num_received_data_columns
})
let expected_columns_opt = block_kzg_commitments_count_opt.map(|blob_count| {
if blob_count > 0 {
*num_expected_columns
} else {
0
}
});
let expected_columns_msg = expected_columns_opt
.as_ref()
.map(|num| num.to_string())
.unwrap_or("unknown".to_string());
let num_received_columns = self.num_received_data_columns();
debug!(log,
"Component(s) added to data availability checker";
"block_root" => ?self.block_root,
"received_block" => block_kzg_commitments_count_opt.is_some(),
"received_columns" => num_received_columns,
"expected_columns" => expected_columns_msg,
);
expected_columns_opt.map_or(false, |num_expected_columns| {
num_expected_columns == num_received_columns
})
}
}
}
@@ -311,10 +347,6 @@ impl<E: EthSpec> PendingComponents<E> {
)))
}
pub fn reconstruction_started(&mut self) {
self.reconstruction_started = true;
}
/// Returns the epoch of the block if it is cached, otherwise returns the epoch of the first blob.
pub fn epoch(&self) -> Option<Epoch> {
self.executed_block
@@ -358,6 +390,15 @@ pub struct DataAvailabilityCheckerInner<T: BeaconChainTypes> {
spec: Arc<ChainSpec>,
}
// This enum is only used internally within the crate in the reconstruction function to improve
// readability, so it's OK to not box the variant value, and it shouldn't impact memory much with
// the current usage, as it's deconstructed immediately.
#[allow(clippy::large_enum_variant)]
pub(crate) enum ReconstructColumnsDecision<E: EthSpec> {
Yes(PendingComponents<E>),
No(&'static str),
}
impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
pub fn new(
capacity: NonZeroUsize,
@@ -448,33 +489,12 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
}
}
/// Potentially trigger reconstruction if:
/// - Our custody requirement is all columns
/// - We >= 50% of columns, but not all columns
fn should_reconstruct(
&self,
block_import_requirement: &BlockImportRequirement,
pending_components: &PendingComponents<T::EthSpec>,
) -> bool {
let BlockImportRequirement::ColumnSampling(num_expected_columns) = block_import_requirement
else {
return false;
};
let num_of_columns = self.spec.number_of_columns;
let has_missing_columns = pending_components.verified_data_columns.len() < num_of_columns;
has_missing_columns
&& !pending_components.reconstruction_started
&& *num_expected_columns == num_of_columns
&& pending_components.verified_data_columns.len() >= num_of_columns / 2
}
pub fn put_kzg_verified_blobs<I: IntoIterator<Item = KzgVerifiedBlob<T::EthSpec>>>(
&self,
block_root: Hash256,
epoch: Epoch,
kzg_verified_blobs: I,
log: &Logger,
) -> Result<Availability<T::EthSpec>, AvailabilityCheckError> {
let mut fixed_blobs = FixedVector::default();
@@ -496,7 +516,7 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
pending_components.merge_blobs(fixed_blobs);
let block_import_requirement = self.block_import_requirement(epoch)?;
if pending_components.is_available(&block_import_requirement) {
if pending_components.is_available(&block_import_requirement, log) {
write_lock.put(block_root, pending_components.clone());
// No need to hold the write lock anymore
drop(write_lock);
@@ -514,12 +534,11 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
I: IntoIterator<Item = KzgVerifiedCustodyDataColumn<T::EthSpec>>,
>(
&self,
kzg: &Kzg,
block_root: Hash256,
epoch: Epoch,
kzg_verified_data_columns: I,
) -> Result<(Availability<T::EthSpec>, DataColumnsToPublish<T::EthSpec>), AvailabilityCheckError>
{
log: &Logger,
) -> Result<Availability<T::EthSpec>, AvailabilityCheckError> {
let mut write_lock = self.critical.write();
// Grab existing entry or create a new entry.
@@ -533,65 +552,67 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
let block_import_requirement = self.block_import_requirement(epoch)?;
// Potentially trigger reconstruction if:
// - Our custody requirement is all columns
// - We >= 50% of columns
let data_columns_to_publish =
if self.should_reconstruct(&block_import_requirement, &pending_components) {
pending_components.reconstruction_started();
let timer = metrics::start_timer(&metrics::DATA_AVAILABILITY_RECONSTRUCTION_TIME);
let existing_column_indices = pending_components
.verified_data_columns
.iter()
.map(|d| d.index())
.collect::<HashSet<_>>();
// Will only return an error if:
// - < 50% of columns
// - There are duplicates
let all_data_columns = KzgVerifiedCustodyDataColumn::reconstruct_columns(
kzg,
pending_components.verified_data_columns.as_slice(),
&self.spec,
)
.map_err(AvailabilityCheckError::ReconstructColumnsError)?;
let data_columns_to_publish = all_data_columns
.iter()
.filter(|d| !existing_column_indices.contains(&d.index()))
.map(|d| d.clone_arc())
.collect::<Vec<_>>();
pending_components.verified_data_columns = all_data_columns;
metrics::stop_timer(timer);
metrics::inc_counter_by(
&metrics::DATA_AVAILABILITY_RECONSTRUCTED_COLUMNS,
data_columns_to_publish.len() as u64,
);
Some(data_columns_to_publish)
} else {
None
};
if pending_components.is_available(&block_import_requirement) {
if pending_components.is_available(&block_import_requirement, log) {
write_lock.put(block_root, pending_components.clone());
// No need to hold the write lock anymore
drop(write_lock);
pending_components
.make_available(block_import_requirement, &self.spec, |diet_block| {
self.state_cache.recover_pending_executed_block(diet_block)
})
.map(|availability| (availability, data_columns_to_publish))
pending_components.make_available(block_import_requirement, &self.spec, |diet_block| {
self.state_cache.recover_pending_executed_block(diet_block)
})
} else {
write_lock.put(block_root, pending_components);
Ok((
Availability::MissingComponents(block_root),
data_columns_to_publish,
))
Ok(Availability::MissingComponents(block_root))
}
}
/// Check whether data column reconstruction should be attempted.
///
/// Potentially trigger reconstruction if:
/// - Our custody requirement is all columns (supernode), and we haven't got all columns
/// - We have >= 50% of columns, but not all columns
/// - Reconstruction hasn't been started for the block
///
/// If reconstruction is required, returns `PendingComponents` which contains the
/// components to be used as inputs to reconstruction, otherwise returns a `reason`.
pub fn check_and_set_reconstruction_started(
&self,
block_root: &Hash256,
) -> ReconstructColumnsDecision<T::EthSpec> {
let mut write_lock = self.critical.write();
let Some(pending_components) = write_lock.get_mut(block_root) else {
// Block may have been imported as it does not exist in availability cache.
return ReconstructColumnsDecision::No("block already imported");
};
// If we're sampling all columns, it means we must be custodying all columns.
let custody_column_count = self.sampling_column_count();
let total_column_count = self.spec.number_of_columns;
let received_column_count = pending_components.verified_data_columns.len();
if pending_components.reconstruction_started {
return ReconstructColumnsDecision::No("already started");
}
if custody_column_count != total_column_count {
return ReconstructColumnsDecision::No("not required for full node");
}
if received_column_count == self.spec.number_of_columns {
return ReconstructColumnsDecision::No("all columns received");
}
if received_column_count < total_column_count / 2 {
return ReconstructColumnsDecision::No("not enough columns");
}
pending_components.reconstruction_started = true;
ReconstructColumnsDecision::Yes(pending_components.clone())
}
/// This could mean some invalid data columns made it through to the `DataAvailabilityChecker`.
/// In this case, we remove all data columns in `PendingComponents` and reset the reconstruction
/// status so that we can attempt to retrieve columns from peers again.
pub fn handle_reconstruction_failure(&self, block_root: &Hash256) {
if let Some(pending_components_mut) = self.critical.write().get_mut(block_root) {
pending_components_mut.verified_data_columns = vec![];
pending_components_mut.reconstruction_started = false;
}
}
@@ -600,6 +621,7 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
pub fn put_pending_executed_block(
&self,
executed_block: AvailabilityPendingExecutedBlock<T::EthSpec>,
log: &Logger,
) -> Result<Availability<T::EthSpec>, AvailabilityCheckError> {
let mut write_lock = self.critical.write();
let block_root = executed_block.import_data.block_root;
@@ -621,7 +643,7 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
// Check if we have all components and entire set is consistent.
let block_import_requirement = self.block_import_requirement(epoch)?;
if pending_components.is_available(&block_import_requirement) {
if pending_components.is_available(&block_import_requirement, log) {
write_lock.put(block_root, pending_components.clone());
// No need to hold the write lock anymore
drop(write_lock);
@@ -919,7 +941,7 @@ mod test {
);
assert!(cache.critical.read().is_empty(), "cache should be empty");
let availability = cache
.put_pending_executed_block(pending_block)
.put_pending_executed_block(pending_block, harness.logger())
.expect("should put block");
if blobs_expected == 0 {
assert!(
@@ -958,7 +980,7 @@ mod test {
for (blob_index, gossip_blob) in blobs.into_iter().enumerate() {
kzg_verified_blobs.push(gossip_blob.into_inner());
let availability = cache
.put_kzg_verified_blobs(root, epoch, kzg_verified_blobs.clone())
.put_kzg_verified_blobs(root, epoch, kzg_verified_blobs.clone(), harness.logger())
.expect("should put blob");
if blob_index == blobs_expected - 1 {
assert!(matches!(availability, Availability::Available(_)));
@@ -985,7 +1007,7 @@ mod test {
for gossip_blob in blobs {
kzg_verified_blobs.push(gossip_blob.into_inner());
let availability = cache
.put_kzg_verified_blobs(root, epoch, kzg_verified_blobs.clone())
.put_kzg_verified_blobs(root, epoch, kzg_verified_blobs.clone(), harness.logger())
.expect("should put blob");
assert_eq!(
availability,
@@ -995,7 +1017,7 @@ mod test {
assert_eq!(cache.critical.read().len(), 1);
}
let availability = cache
.put_pending_executed_block(pending_block)
.put_pending_executed_block(pending_block, harness.logger())
.expect("should put block");
assert!(
matches!(availability, Availability::Available(_)),
@@ -1063,7 +1085,7 @@ mod test {
// put the block in the cache
let availability = cache
.put_pending_executed_block(pending_block)
.put_pending_executed_block(pending_block, harness.logger())
.expect("should put block");
// grab the diet block from the cache for later testing
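
The race fix mentioned in the commit message lives in `check_and_set_reconstruction_started` above: the eligibility checks and the `reconstruction_started = true` mutation share a single write-lock critical section, so two concurrent callers can never both decide to reconstruct. A minimal sketch of that check-and-set discipline, assuming `parking_lot` and a hypothetical pared-down `Pending` struct standing in for `PendingComponents`:

use parking_lot::RwLock;

// Hypothetical pared-down stand-in for `PendingComponents`.
#[derive(Default)]
struct Pending {
    reconstruction_started: bool,
    column_count: usize,
}

enum Decision {
    Yes,
    No(&'static str),
}

// The checks and the mutation happen under one write lock, so no other
// caller can observe `reconstruction_started == false` in between.
fn check_and_set(state: &RwLock<Pending>, total_columns: usize) -> Decision {
    let mut guard = state.write();
    if guard.reconstruction_started {
        return Decision::No("already started");
    }
    if guard.column_count >= total_columns {
        return Decision::No("all columns received");
    }
    if guard.column_count < total_columns / 2 {
        return Decision::No("not enough columns");
    }
    guard.reconstruction_started = true;
    Decision::Yes
}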

View File

@@ -313,10 +313,7 @@ impl<E: EthSpec> KzgVerifiedCustodyDataColumn<E> {
kzg: &Kzg,
partial_set_of_columns: &[Self],
spec: &ChainSpec,
) -> Result<Vec<Self>, KzgError> {
// Will only return an error if:
// - < 50% of columns
// - There are duplicates
) -> Result<Vec<KzgVerifiedCustodyDataColumn<E>>, KzgError> {
let all_data_columns = reconstruct_data_columns(
kzg,
&partial_set_of_columns
@@ -328,10 +325,8 @@ impl<E: EthSpec> KzgVerifiedCustodyDataColumn<E> {
Ok(all_data_columns
.into_iter()
.map(|d| {
KzgVerifiedCustodyDataColumn::from_asserted_custody(KzgVerifiedDataColumn {
data: d,
})
.map(|data| {
KzgVerifiedCustodyDataColumn::from_asserted_custody(KzgVerifiedDataColumn { data })
})
.collect::<Vec<_>>())
}

View File

@@ -1887,6 +1887,31 @@ pub static DATA_AVAILABILITY_RECONSTRUCTED_COLUMNS: LazyLock<Result<IntCounter>>
)
});
pub static KZG_DATA_COLUMN_RECONSTRUCTION_ATTEMPTS: LazyLock<Result<IntCounter>> =
LazyLock::new(|| {
try_create_int_counter(
"kzg_data_column_reconstruction_attempts",
"Count of times data column reconstruction has been attempted",
)
});
pub static KZG_DATA_COLUMN_RECONSTRUCTION_FAILURES: LazyLock<Result<IntCounter>> =
LazyLock::new(|| {
try_create_int_counter(
"kzg_data_column_reconstruction_failures",
"Count of times data column reconstruction has failed",
)
});
pub static KZG_DATA_COLUMN_RECONSTRUCTION_INCOMPLETE_TOTAL: LazyLock<Result<IntCounterVec>> =
LazyLock::new(|| {
try_create_int_counter_vec(
"kzg_data_column_reconstruction_incomplete_total",
"Count of times data column reconstruction attempts did not result in an import",
&["reason"],
)
});
/*
* light_client server metrics
*/

View File

@@ -4,6 +4,7 @@ use crate::{
service::NetworkMessage,
sync::SyncMessage,
};
use beacon_chain::blob_verification::{GossipBlobError, GossipVerifiedBlob};
use beacon_chain::block_verification_types::AsBlock;
use beacon_chain::data_column_verification::{GossipDataColumnError, GossipVerifiedDataColumn};
use beacon_chain::store::Error;
@@ -18,13 +19,7 @@ use beacon_chain::{
AvailabilityProcessingStatus, BeaconChainError, BeaconChainTypes, BlockError, ForkChoiceError,
GossipVerifiedBlock, NotifyExecutionLayer,
};
use beacon_chain::{
blob_verification::{GossipBlobError, GossipVerifiedBlob},
data_availability_checker::DataColumnsToPublish,
};
use lighthouse_network::{
Client, MessageAcceptance, MessageId, PeerAction, PeerId, PubsubMessage, ReportSource,
};
use lighthouse_network::{Client, MessageAcceptance, MessageId, PeerAction, PeerId, ReportSource};
use operation_pool::ReceivedPreCapella;
use slog::{crit, debug, error, info, trace, warn, Logger};
use slot_clock::SlotClock;
@@ -171,26 +166,6 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
})
}
pub(crate) fn handle_data_columns_to_publish(
&self,
data_columns_to_publish: DataColumnsToPublish<T::EthSpec>,
) {
if let Some(data_columns_to_publish) = data_columns_to_publish {
self.send_network_message(NetworkMessage::Publish {
messages: data_columns_to_publish
.iter()
.map(|d| {
let subnet = DataColumnSubnetId::from_column_index::<T::EthSpec>(
d.index as usize,
&self.chain.spec,
);
PubsubMessage::DataColumnSidecar(Box::new((subnet, d.clone())))
})
.collect(),
});
}
}
/// Send a message on `message_tx` that the `message_id` sent by `peer_id` should be propagated on
/// the gossip network.
///
@@ -1022,9 +997,7 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
.process_gossip_data_columns(vec![verified_data_column], || Ok(()))
.await
{
Ok((availability, data_columns_to_publish)) => {
self.handle_data_columns_to_publish(data_columns_to_publish);
Ok(availability) => {
match availability {
AvailabilityProcessingStatus::Imported(block_root) => {
// Note: Reusing block imported metric here
@@ -1052,7 +1025,7 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
"block_root" => %block_root,
);
// Potentially trigger reconstruction
self.attempt_data_column_reconstruction(block_root).await;
}
}
}

View File

@@ -2,7 +2,9 @@ use crate::sync::manager::BlockProcessType;
use crate::sync::SamplingId;
use crate::{service::NetworkMessage, sync::manager::SyncMessage};
use beacon_chain::block_verification_types::RpcBlock;
use beacon_chain::{builder::Witness, eth1_chain::CachingEth1Backend, BeaconChain};
use beacon_chain::{
builder::Witness, eth1_chain::CachingEth1Backend, AvailabilityProcessingStatus, BeaconChain,
};
use beacon_chain::{BeaconChainTypes, NotifyExecutionLayer};
use beacon_processor::{
work_reprocessing_queue::ReprocessQueueMessage, BeaconProcessorChannels, BeaconProcessorSend,
@@ -16,9 +18,9 @@ use lighthouse_network::rpc::methods::{
use lighthouse_network::rpc::{RequestId, SubstreamId};
use lighthouse_network::{
rpc::{BlocksByRangeRequest, BlocksByRootRequest, LightClientBootstrapRequest, StatusMessage},
Client, MessageId, NetworkGlobals, PeerId,
Client, MessageId, NetworkGlobals, PeerId, PubsubMessage,
};
use slog::{debug, Logger};
use slog::{debug, error, trace, Logger};
use slot_clock::ManualSlotClock;
use std::path::PathBuf;
use std::sync::Arc;
@@ -848,6 +850,75 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
"error" => %e)
});
}
/// Attempt to reconstruct all data columns if the following conditions are satisfied:
/// - Our custody requirement is all columns
/// - We have >= 50% of columns, but not all columns
///
/// Returns `Some(AvailabilityProcessingStatus)` if reconstruction is successfully performed,
/// otherwise returns `None`.
async fn attempt_data_column_reconstruction(
&self,
block_root: Hash256,
) -> Option<AvailabilityProcessingStatus> {
let result = self.chain.reconstruct_data_columns(block_root).await;
match result {
Ok(Some((availability_processing_status, data_columns_to_publish))) => {
self.send_network_message(NetworkMessage::Publish {
messages: data_columns_to_publish
.iter()
.map(|d| {
let subnet = DataColumnSubnetId::from_column_index::<T::EthSpec>(
d.index as usize,
&self.chain.spec,
);
PubsubMessage::DataColumnSidecar(Box::new((subnet, d.clone())))
})
.collect(),
});
match &availability_processing_status {
AvailabilityProcessingStatus::Imported(hash) => {
debug!(
self.log,
"Block components available via reconstruction";
"result" => "imported block and custody columns",
"block_hash" => %hash,
);
self.chain.recompute_head_at_current_slot().await;
}
AvailabilityProcessingStatus::MissingComponents(_, _) => {
debug!(
self.log,
"Block components still missing block after reconstruction";
"result" => "imported all custody columns",
"block_hash" => %block_root,
);
}
}
Some(availability_processing_status)
}
Ok(None) => {
// reason is tracked via the `KZG_DATA_COLUMN_RECONSTRUCTION_INCOMPLETE_TOTAL` metric
trace!(
self.log,
"Reconstruction not required for block";
"block_hash" => %block_root,
);
None
}
Err(e) => {
error!(
self.log,
"Error during data column reconstruction";
"block_root" => %block_root,
"error" => ?e
);
None
}
}
}
}
type TestBeaconChainType<E> =

View File

@@ -327,34 +327,37 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
_seen_timestamp: Duration,
process_type: BlockProcessType,
) {
let result = self
let mut result = self
.chain
.process_rpc_custody_columns(custody_columns)
.await;
match &result {
Ok((availability, data_columns_to_publish)) => {
self.handle_data_columns_to_publish(data_columns_to_publish.clone());
match availability {
AvailabilityProcessingStatus::Imported(hash) => {
debug!(
self.log,
"Block components retrieved";
"result" => "imported block and custody columns",
"block_hash" => %hash,
);
self.chain.recompute_head_at_current_slot().await;
}
AvailabilityProcessingStatus::MissingComponents(_, _) => {
debug!(
self.log,
"Missing components over rpc";
"block_hash" => %block_root,
);
Ok(availability) => match availability {
AvailabilityProcessingStatus::Imported(hash) => {
debug!(
self.log,
"Block components retrieved";
"result" => "imported block and custody columns",
"block_hash" => %hash,
);
self.chain.recompute_head_at_current_slot().await;
}
AvailabilityProcessingStatus::MissingComponents(_, _) => {
debug!(
self.log,
"Missing components over rpc";
"block_hash" => %block_root,
);
// Attempt reconstruction here before notifying sync, to avoid sending out more requests
// that we may no longer need.
if let Some(availability) =
self.attempt_data_column_reconstruction(block_root).await
{
result = Ok(availability)
}
}
}
},
Err(BlockError::DuplicateFullyImported(_)) => {
debug!(
self.log,
@@ -374,7 +377,7 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
self.send_sync_message(SyncMessage::BlockComponentProcessed {
process_type,
result: result.map(|(r, _)| r).into(),
result: result.into(),
});
}