Instrument tracing spans for block processing and import (#7816)

#7815

- removes all existing spans, so some span fields that previously appeared in logs (such as `service_name`) may be lost.
- instruments a few key code paths in the beacon node, starting from **root spans** named below:

* Gossip block and blobs
* `process_gossip_data_column_sidecar`
* `process_gossip_blob`
* `process_gossip_block`
* Rpc block and blobs
* `process_rpc_block`
* `process_rpc_blobs`
* `process_rpc_custody_columns`
* Rpc blocks (range and backfill)
* `process_chain_segment`
* `PendingComponents` lifecycle
* `pending_components`

To test locally:
* Run Grafana and Tempo with https://github.com/sigp/lighthouse-metrics/pull/57
* Run Lighthouse BN with `--telemetry-collector-url http://localhost:4317`

Some captured traces can be found here: https://hackmd.io/@jimmygchen/r1sLOxPPeg

Removing the old spans seems to have reduced memory usage quite a lot — I think we were using them on long-running tasks and far too liberally:
<img width="910" height="495" alt="image" src="https://github.com/user-attachments/assets/5208bbe4-53b2-4ead-bc71-0b782c788669" />
This commit is contained in:
Jimmy Chen
2025-08-08 15:32:22 +10:00
committed by GitHub
parent 6dfab22267
commit 40c2fd5ff4
52 changed files with 633 additions and 1164 deletions

View File

@@ -27,7 +27,7 @@ use std::collections::{
HashSet,
};
use std::sync::Arc;
use tracing::{debug, error, info, instrument, warn};
use tracing::{debug, error, info, warn};
use types::{Epoch, EthSpec};
/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
@@ -147,10 +147,6 @@ pub struct BackFillSync<T: BeaconChainTypes> {
}
impl<T: BeaconChainTypes> BackFillSync<T> {
#[instrument(parent = None,
name = "backfill_sync",
skip_all
)]
pub fn new(
beacon_chain: Arc<BeaconChain<T>>,
network_globals: Arc<NetworkGlobals<T::EthSpec>>,
@@ -191,11 +187,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Pauses the backfill sync if it's currently syncing.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
pub fn pause(&mut self) {
if let BackFillState::Syncing = self.state() {
debug!(processed_epochs = %self.validated_batches, to_be_processed = %self.current_start,"Backfill sync paused");
@@ -207,11 +198,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
///
/// If resuming is successful, reports back the current syncing metrics.
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
pub fn start(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -287,11 +273,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// A fully synced peer has joined us.
/// If we are in a failed state, update a local variable to indicate we are able to restart
/// the failed sync on the next attempt.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
pub fn fully_synced_peer_joined(&mut self) {
if matches!(self.state(), BackFillState::Failed) {
self.restart_failed_sync = true;
@@ -300,11 +281,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// A peer has disconnected.
/// If the peer has active batches, those are considered failed and re-requested.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Result<(), BackFillError> {
if matches!(self.state(), BackFillState::Failed) {
@@ -319,11 +295,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// An RPC error has occurred.
///
/// If the batch exists it is re-requested.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn inject_error(
&mut self,
@@ -361,11 +332,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// If this returns an error, the backfill sync has failed and will be restarted once new peers
/// join the system.
/// The sync manager should update the global sync state on failure.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn on_block_response(
&mut self,
@@ -417,11 +383,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// The syncing process has failed.
///
/// This resets past variables, to allow for a fresh start when resuming.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn fail_sync(&mut self, error: BackFillError) -> Result<(), BackFillError> {
// Some errors shouldn't fail the chain.
if matches!(error, BackFillError::Paused) {
@@ -453,11 +414,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// Processes the batch with the given id.
/// The batch must exist and be ready for processing
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn process_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -516,11 +472,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// The block processor has completed processing a batch. This function handles the result
/// of the batch processor.
/// If an error is returned the BackFill sync has failed.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn on_batch_process_result(
&mut self,
@@ -673,11 +624,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Processes the next ready batch.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn process_completed_batches(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -741,11 +687,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
///
/// If a previous batch has been validated and it had been re-processed, penalize the original
/// peer.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn advance_chain(&mut self, network: &mut SyncNetworkContext<T>, validating_epoch: Epoch) {
// make sure this epoch produces an advancement
if validating_epoch >= self.current_start {
@@ -837,11 +778,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// These events occur when a peer has successfully responded with blocks, but the blocks we
/// have received are incorrect or invalid. This indicates the peer has not performed as
/// intended and can result in downvoting a peer.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn handle_invalid_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -893,11 +829,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Requests the batch assigned to the given id from a given peer.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn send_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -969,11 +900,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// When resuming a chain, this function searches for batches that need to be re-downloaded and
/// transitions their state to redownload the batch.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn resume_batches(&mut self, network: &mut SyncNetworkContext<T>) -> Result<(), BackFillError> {
let batch_ids_to_retry = self
.batches
@@ -998,11 +924,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer
/// pool and left over batches until the batch buffer is reached or all peers are exhausted.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn request_batches(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -1027,11 +948,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// Creates the next required batch from the chain. If there are no more batches required,
/// `false` is returned.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn include_next_batch(&mut self, network: &mut SyncNetworkContext<T>) -> Option<BatchId> {
// don't request batches beyond genesis;
if self.last_batch_downloaded {
@@ -1093,11 +1009,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
///
/// This errors if the beacon chain indicates that backfill sync has already completed or is
/// not required.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn reset_start_epoch(&mut self) -> Result<(), ResetEpochError> {
let anchor_info = self.beacon_chain.store.get_anchor_info();
if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) {
@@ -1111,11 +1022,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Checks with the beacon chain if backfill sync has completed.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn check_completed(&mut self) -> bool {
if self.would_complete(self.current_start) {
// Check that the beacon chain agrees
@@ -1131,11 +1037,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Checks if backfill would complete by syncing to `start_epoch`.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn would_complete(&self, start_epoch: Epoch) -> bool {
start_epoch
<= self
@@ -1145,20 +1046,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Updates the global network state indicating the current state of a backfill sync.
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn set_state(&self, state: BackFillState) {
*self.network_globals.backfill_state.write() = state;
}
#[instrument(parent = None,
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn state(&self) -> BackFillState {
self.network_globals.backfill_state.read().clone()
}