Instrument tracing spans for block processing and import (#7816)

#7815 - removes all existing spans, so some span fields that appear in logs like `service_name` may be lost. - instruments a few key code paths in the beacon node, starting from **root spans** named below: * Gossip block and blobs * `process_gossip_data_column_sidecar` * `process_gossip_blob` * `process_gossip_block` * RPC block and blobs * `process_rpc_block` * `process_rpc_blobs` * `process_rpc_custody_columns` * RPC blocks (range and backfill) * `process_chain_segment` * `PendingComponents` lifecycle * `pending_components` To test locally: * Run Grafana and Tempo with https://github.com/sigp/lighthouse-metrics/pull/57 * Run Lighthouse BN with `--telemetry-collector-url http://localhost:4317` Some captured traces can be found here: https://hackmd.io/@jimmygchen/r1sLOxPPeg Removing the old spans seems to have reduced memory usage quite a lot - I think we were using them on long-running tasks and too excessively: <img width="910" height="495" alt="image" src="https://github.com/user-attachments/assets/5208bbe4-53b2-4ead-bc71-0b782c788669" />
2026-03-06 10:11:44 +00:00 · 2025-08-08 15:32:22 +10:00
parent 6dfab22267
commit 40c2fd5ff4
52 changed files with 633 additions and 1164 deletions
--- a/beacon_node/network/src/sync/range_sync/chain.rs
+++ b/beacon_node/network/src/sync/range_sync/chain.rs
@@ -12,7 +12,7 @@ use lighthouse_network::{PeerAction, PeerId};
 use logging::crit;
 use std::collections::{btree_map::Entry, BTreeMap, HashSet};
 use strum::IntoStaticStr;
-use tracing::{debug, instrument, warn};
+use tracing::{debug, warn};
 use types::{ColumnIndex, Epoch, EthSpec, Hash256, Slot};

 /// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
@@ -205,7 +205,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {

    /// A block has been received for a batch on this chain.
    /// If the block correctly completes the batch it will be processed if possible.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    pub fn on_block_response(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -252,7 +251,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {

    /// Processes the batch with the given id.
    /// The batch must exist and be ready for processing
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    fn process_batch(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -300,7 +298,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
    }

    /// Processes the next ready batch, prioritizing optimistic batches over the processing target.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    fn process_completed_batches(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -410,7 +407,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {

    /// The block processor has completed processing a batch. This function handles the result
    /// of the batch processor.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    pub fn on_batch_process_result(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -565,7 +561,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        }
    }

-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    fn reject_optimistic_batch(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -600,7 +595,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
    /// If a previous batch has been validated and it had been re-processed, penalize the original
    /// peer.
    #[allow(clippy::modulo_one)]
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    fn advance_chain(&mut self, network: &mut SyncNetworkContext<T>, validating_epoch: Epoch) {
        // make sure this epoch produces an advancement
        if validating_epoch <= self.start_epoch {
@@ -704,7 +698,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
    /// These events occur when a peer has successfully responded with blocks, but the blocks we
    /// have received are incorrect or invalid. This indicates the peer has not performed as
    /// intended and can result in downvoting a peer.
-    #[instrument(parent = None, fields(service = self.id, network), skip_all)]
    fn handle_invalid_batch(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -764,7 +757,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
    /// This chain has been requested to start syncing.
    ///
    /// This could be new chain, or an old chain that is being resumed.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    pub fn start_syncing(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -803,7 +795,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
    /// Add a peer to the chain.
    ///
    /// If the chain is active, this starts requesting batches from this peer.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    pub fn add_peer(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -816,7 +807,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
    /// An RPC error has occurred.
    ///
    /// If the batch exists it is re-requested.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    pub fn inject_error(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -900,7 +890,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
    }

    /// Requests the batch assigned to the given id from a given peer.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    pub fn send_batch(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -979,7 +968,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
    }

    /// Retries partial column requests within the batch by creating new requests for the failed columns.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    pub fn retry_partial_batch(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -1032,7 +1020,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {

    /// Kickstarts the chain by sending for processing batches that are ready and requesting more
    /// batches if needed.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    pub fn resume(
        &mut self,
        network: &mut SyncNetworkContext<T>,
@@ -1045,7 +1032,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {

    /// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer
    /// pool and left over batches until the batch buffer is reached or all peers are exhausted.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    fn request_batches(&mut self, network: &mut SyncNetworkContext<T>) -> ProcessingResult {
        if !matches!(self.state, ChainSyncingState::Syncing) {
            return Ok(KeepChain);
@@ -1114,7 +1100,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {

    /// Creates the next required batch from the chain. If there are no more batches required,
    /// `false` is returned.
-    #[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
    fn include_next_batch(&mut self, network: &mut SyncNetworkContext<T>) -> Option<BatchId> {
        // don't request batches beyond the target head slot
        if self