Instrument tracing spans for block processing and import (#7816)

#7815

- removes all existing spans, so some span fields that appear in logs like `service_name` may be lost.
- instruments a few key code paths in the beacon node, starting from **root spans** named below:

* Gossip block and blobs
* `process_gossip_data_column_sidecar`
* `process_gossip_blob`
* `process_gossip_block`
* Rpc block and blobs
* `process_rpc_block`
* `process_rpc_blobs`
* `process_rpc_custody_columns`
* Rpc blocks (range and backfill)
* `process_chain_segment`
* `PendingComponents` lifecycle
* `pending_components`

To test locally:
* Run Grafana and Tempo with https://github.com/sigp/lighthouse-metrics/pull/57
* Run Lighthouse BN with `--telemetry-collector-url http://localhost:4317`

Some captured traces can be found here: https://hackmd.io/@jimmygchen/r1sLOxPPeg

Removing the old spans seem to have reduced the memory usage quite a lot - i think we were using them on long running tasks and too excessively:
<img width="910" height="495" alt="image" src="https://github.com/user-attachments/assets/5208bbe4-53b2-4ead-bc71-0b782c788669" />
This commit is contained in:
Jimmy Chen
2025-08-08 15:32:22 +10:00
committed by GitHub
parent 6dfab22267
commit 40c2fd5ff4
52 changed files with 633 additions and 1164 deletions

View File

@@ -14,7 +14,7 @@ use parking_lot::RwLock;
use std::cmp::Ordering;
use std::num::NonZeroUsize;
use std::sync::Arc;
use tracing::debug;
use tracing::{debug, debug_span, Span};
use types::blob_sidecar::BlobIdentifier;
use types::{
BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec,
@@ -31,6 +31,7 @@ pub struct PendingComponents<E: EthSpec> {
pub verified_data_columns: Vec<KzgVerifiedCustodyDataColumn<E>>,
pub executed_block: Option<DietAvailabilityPendingExecutedBlock<E>>,
pub reconstruction_started: bool,
span: Span,
}
impl<E: EthSpec> PendingComponents<E> {
@@ -87,6 +88,8 @@ impl<E: EthSpec> PendingComponents<E> {
/// Inserts a block into the cache.
pub fn insert_block(&mut self, block: DietAvailabilityPendingExecutedBlock<E>) {
let _guard = self.span.clone().entered();
debug!("Block added to pending components");
*self.get_cached_block_mut() = Some(block)
}
@@ -94,7 +97,9 @@ impl<E: EthSpec> PendingComponents<E> {
///
/// Existing blob at the index will be replaced.
pub fn insert_blob_at_index(&mut self, blob_index: usize, blob: KzgVerifiedBlob<E>) {
let _guard = self.span.clone().entered();
if let Some(b) = self.get_cached_blobs_mut().get_mut(blob_index) {
debug!(blob_index, "Blob added to pending components");
*b = Some(blob);
}
}
@@ -134,11 +139,17 @@ impl<E: EthSpec> PendingComponents<E> {
&mut self,
kzg_verified_data_columns: I,
) -> Result<(), AvailabilityCheckError> {
let _guard = self.span.clone().entered();
for data_column in kzg_verified_data_columns {
if self.get_cached_data_column(data_column.index()).is_none() {
debug!(
column_index = data_column.index(),
"Data column added to pending components"
);
self.verified_data_columns.push(data_column);
}
}
Ok(())
}
@@ -165,6 +176,7 @@ impl<E: EthSpec> PendingComponents<E> {
where
R: FnOnce(
DietAvailabilityPendingExecutedBlock<E>,
&Span,
) -> Result<AvailabilityPendingExecutedBlock<E>, AvailabilityCheckError>,
{
let Some(block) = &self.executed_block else {
@@ -254,7 +266,7 @@ impl<E: EthSpec> PendingComponents<E> {
block,
import_data,
payload_verification_outcome,
} = recover(block.clone())?;
} = recover(block.clone(), &self.span)?;
let available_block = AvailableBlock {
block_root: self.block_root,
@@ -263,6 +275,10 @@ impl<E: EthSpec> PendingComponents<E> {
blobs_available_timestamp,
spec: spec.clone(),
};
self.span.in_scope(|| {
debug!("Block and all data components are available");
});
Ok(Some(AvailableExecutedBlock::new(
available_block,
import_data,
@@ -272,12 +288,15 @@ impl<E: EthSpec> PendingComponents<E> {
/// Returns an empty `PendingComponents` object with the given block root.
pub fn empty(block_root: Hash256, max_len: usize) -> Self {
let span = debug_span!(parent: None, "pending_components", %block_root);
let _guard = span.clone().entered();
Self {
block_root,
verified_blobs: RuntimeFixedVector::new(vec![None; max_len]),
verified_data_columns: vec![],
executed_block: None,
reconstruction_started: false,
span,
}
}
@@ -482,7 +501,7 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
&self.spec,
self.custody_context
.num_of_data_columns_to_sample(epoch, &self.spec),
|block| self.state_cache.recover_pending_executed_block(block),
|block, span| self.state_cache.recover_pending_executed_block(block, span),
)? {
// We keep the pending components in the availability cache during block import (#5845).
write_lock.put(block_root, pending_components);
@@ -538,8 +557,8 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
);
if let Some(available_block) =
pending_components.make_available(&self.spec, num_expected_columns, |block| {
self.state_cache.recover_pending_executed_block(block)
pending_components.make_available(&self.spec, num_expected_columns, |block, span| {
self.state_cache.recover_pending_executed_block(block, span)
})?
{
// We keep the pending components in the availability cache during block import (#5845).
@@ -637,8 +656,8 @@ impl<T: BeaconChainTypes> DataAvailabilityCheckerInner<T> {
// Check if we have all components and entire set is consistent.
if let Some(available_block) =
pending_components.make_available(&self.spec, num_expected_columns, |block| {
self.state_cache.recover_pending_executed_block(block)
pending_components.make_available(&self.spec, num_expected_columns, |block, span| {
self.state_cache.recover_pending_executed_block(block, span)
})?
{
// We keep the pending components in the availability cache during block import (#5845).
@@ -712,7 +731,7 @@ mod test {
use std::collections::VecDeque;
use store::{database::interface::BeaconNodeBackend, HotColdDB, ItemStore, StoreConfig};
use tempfile::{tempdir, TempDir};
use tracing::info;
use tracing::{debug_span, info};
use types::non_zero_usize::new_non_zero_usize;
use types::{ExecPayload, MinimalEthSpec};
@@ -1107,7 +1126,7 @@ mod test {
// reconstruct the pending block by replaying the block on the parent state
let recovered_pending_block = cache
.state_lru_cache()
.recover_pending_executed_block(diet_block)
.recover_pending_executed_block(diet_block, &debug_span!("test"))
.expect("should reconstruct pending block");
// assert the recovered state is the same as the original
@@ -1133,7 +1152,7 @@ mod test {
// recover the pending block from the cache
let recovered_pending_block = cache
.state_lru_cache()
.recover_pending_executed_block(diet_block)
.recover_pending_executed_block(diet_block, &debug_span!("test"))
.expect("should reconstruct pending block");
// assert the recovered state is the same as the original
assert_eq!(

View File

@@ -9,6 +9,7 @@ use parking_lot::RwLock;
use state_processing::BlockReplayer;
use std::sync::Arc;
use store::OnDiskConsensusContext;
use tracing::{debug_span, instrument, Span};
use types::beacon_block_body::KzgCommitments;
use types::{BeaconState, BlindedPayload, ChainSpec, Epoch, EthSpec, Hash256, SignedBeaconBlock};
@@ -106,9 +107,11 @@ impl<T: BeaconChainTypes> StateLRUCache<T> {
/// This method will first check the cache and if the state is not found
/// it will reconstruct the state by loading the parent state from disk and
/// replaying the block.
#[instrument(skip_all, parent = _span, level = "debug")]
pub fn recover_pending_executed_block(
&self,
diet_executed_block: DietAvailabilityPendingExecutedBlock<T::EthSpec>,
_span: &Span,
) -> Result<AvailabilityPendingExecutedBlock<T::EthSpec>, AvailabilityCheckError> {
let state = if let Some(state) = self.states.write().pop(&diet_executed_block.state_root) {
state
@@ -132,6 +135,7 @@ impl<T: BeaconChainTypes> StateLRUCache<T> {
/// Reconstruct the state by loading the parent state from disk and replaying
/// the block.
#[instrument(skip_all, level = "debug")]
fn reconstruct_state(
&self,
diet_executed_block: &DietAvailabilityPendingExecutedBlock<T::EthSpec>,
@@ -164,8 +168,11 @@ impl<T: BeaconChainTypes> StateLRUCache<T> {
.state_root_iter(state_roots.into_iter())
.minimal_block_root_verification();
let block_replayer = debug_span!("reconstruct_state_apply_blocks").in_scope(|| {
block_replayer.apply_blocks(vec![diet_executed_block.block.clone_as_blinded()], None)
});
block_replayer
.apply_blocks(vec![diet_executed_block.block.clone_as_blinded()], None)
.map(|block_replayer| block_replayer.into_state())
.and_then(|mut state| {
state