Instrument tracing spans for block processing and import (#7816)

#7815

- removes all existing spans, so some span fields that appear in logs like `service_name` may be lost.
- instruments a few key code paths in the beacon node, starting from **root spans** named below:

* Gossip block and blobs
  * `process_gossip_data_column_sidecar`
  * `process_gossip_blob`
  * `process_gossip_block`
* Rpc block and blobs
  * `process_rpc_block`
  * `process_rpc_blobs`
  * `process_rpc_custody_columns`
* Rpc blocks (range and backfill)
  * `process_chain_segment`
* `PendingComponents` lifecycle
  * `pending_components`

To test locally:
* Run Grafana and Tempo with https://github.com/sigp/lighthouse-metrics/pull/57
* Run Lighthouse BN with `--telemetry-collector-url http://localhost:4317`

Some captured traces can be found here: https://hackmd.io/@jimmygchen/r1sLOxPPeg

Removing the old spans seems to have reduced the memory usage quite a lot - I think we were using them on long-running tasks and too excessively:
<img width="910" height="495" alt="image" src="https://github.com/user-attachments/assets/5208bbe4-53b2-4ead-bc71-0b782c788669" />
This commit is contained in:
Jimmy Chen
2025-08-08 15:32:22 +10:00
committed by GitHub
parent 6dfab22267
commit 40c2fd5ff4
52 changed files with 633 additions and 1164 deletions

View File

@@ -45,7 +45,7 @@ use std::collections::hash_map::Entry;
use std::sync::Arc;
use std::time::Duration;
use store::Hash256;
use tracing::{debug, error, instrument, warn};
use tracing::{debug, error, warn};
use types::{BlobSidecar, DataColumnSidecar, EthSpec, SignedBeaconBlock};
pub mod common;
@@ -127,7 +127,6 @@ use lighthouse_network::service::api_types::Id;
pub(crate) type BlockLookupSummary = (Id, Hash256, Option<Hash256>, Vec<PeerId>);
impl<T: BeaconChainTypes> BlockLookups<T> {
#[instrument(parent = None, fields(service = "lookup_sync"), name = "lookup_sync")]
pub fn new() -> Self {
Self {
failed_chains: LRUTimeCache::new(Duration::from_secs(
@@ -138,31 +137,16 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
#[cfg(test)]
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) {
self.failed_chains.insert(block_root);
}
#[cfg(test)]
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub(crate) fn get_failed_chains(&mut self) -> Vec<Hash256> {
self.failed_chains.keys().cloned().collect()
}
#[cfg(test)]
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub(crate) fn active_single_lookups(&self) -> Vec<BlockLookupSummary> {
self.single_block_lookups
.iter()
@@ -171,11 +155,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
/// Returns a vec of all parent lookup chains by tip, in descending slot order (tip first)
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub(crate) fn active_parent_lookups(&self) -> Vec<NodeChain> {
compute_parent_chains(
&self
@@ -192,11 +171,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// If a parent lookup exists or is triggered, a current lookup will be created.
///
/// Returns true if the lookup is created or already exists
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
#[must_use = "only reference the new lookup if returns true"]
pub fn search_child_and_parent(
&mut self,
@@ -230,11 +204,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Search a block whose parent root is unknown.
///
/// Returns true if the lookup is created or already exists
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
#[must_use = "only reference the new lookup if returns true"]
pub fn search_unknown_block(
&mut self,
@@ -251,11 +220,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// - `block_root_to_search` is a failed chain
///
/// Returns true if the lookup is created or already exists
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
#[must_use = "only reference the new lookup if returns true"]
pub fn search_parent_of_child(
&mut self,
@@ -358,11 +322,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Searches for a single block hash. If the block's parent is unknown, a chain of blocks is
/// constructed.
/// Returns true if the lookup is created or already exists
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
#[must_use = "only reference the new lookup if returns true"]
fn new_current_lookup(
&mut self,
@@ -466,11 +425,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/* Lookup responses */
/// Process a block or blob response received from a single lookup request.
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn on_download_response<R: RequestState<T>>(
&mut self,
id: SingleLookupReqId,
@@ -556,11 +510,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/* Error responses */
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn peer_disconnected(&mut self, peer_id: &PeerId) {
for (_, lookup) in self.single_block_lookups.iter_mut() {
lookup.remove_peer(peer_id);
@@ -569,11 +518,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/* Processing responses */
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn on_processing_result(
&mut self,
process_type: BlockProcessType,
@@ -594,11 +538,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
self.on_lookup_result(process_type.id(), lookup_result, "processing_result", cx);
}
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn on_processing_result_inner<R: RequestState<T>>(
&mut self,
lookup_id: SingleLookupId,
@@ -788,11 +727,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
}
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn on_external_processing_result(
&mut self,
block_root: Hash256,
@@ -818,11 +752,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
/// Makes progress on the immediate children of `block_root`
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn continue_child_lookups(&mut self, block_root: Hash256, cx: &mut SyncNetworkContext<T>) {
let mut lookup_results = vec![]; // < need to buffer lookup results to not re-borrow &mut self
@@ -848,11 +777,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Drops `dropped_id` lookup and all its children recursively. Lookups awaiting a parent need
/// the parent to make progress to resolve, therefore we must drop them if the parent is
/// dropped.
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn drop_lookup_and_children(&mut self, dropped_id: SingleLookupId) {
if let Some(dropped_lookup) = self.single_block_lookups.remove(&dropped_id) {
debug!(
@@ -877,11 +801,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Common handler for a lookup request error, drop it and update metrics
/// Returns true if the lookup is created or already exists
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn on_lookup_result(
&mut self,
id: SingleLookupId,
@@ -919,22 +838,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/* Helper functions */
/// Drops all the single block requests and returns how many requests were dropped.
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn drop_single_block_requests(&mut self) -> usize {
let requests_to_drop = self.single_block_lookups.len();
self.single_block_lookups.clear();
requests_to_drop
}
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn update_metrics(&self) {
metrics::set_gauge(
&metrics::SYNC_SINGLE_BLOCK_LOOKUPS,
@@ -943,11 +852,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
/// Perform some prune operations on lookups on some interval
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn prune_lookups(&mut self) {
self.drop_lookups_without_peers();
self.drop_stuck_lookups();
@@ -971,11 +875,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
///
/// Instead there's no negative for keeping lookups with no peers around for some time. If we
/// regularly prune them, it should not be a memory concern (TODO: maybe yes!).
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn drop_lookups_without_peers(&mut self) {
for (lookup_id, block_root) in self
.single_block_lookups
@@ -1013,11 +912,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
///
/// - One single clear warn level log per stuck incident
/// - If the original bug is sporadic, it reduces the time a node is stuck from forever to 15 min
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn drop_stuck_lookups(&mut self) {
// While loop to find and drop all disjoint trees of potentially stuck lookups.
while let Some(stuck_lookup) = self.single_block_lookups.values().find(|lookup| {
@@ -1055,11 +949,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
/// Recursively find the oldest ancestor lookup of another lookup
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn find_oldest_ancestor_lookup<'a>(
&'a self,
lookup: &'a SingleBlockLookup<T>,
@@ -1084,11 +973,6 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Adds peers to a lookup and its ancestors recursively.
/// Note: Takes a `lookup_id` as argument to allow recursion on mutable lookups, without having
/// to duplicate the code to add peers to a lookup
#[instrument(parent = None,
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn add_peers_to_lookup_and_ancestors(
&mut self,
lookup_id: SingleLookupId,