Instrument tracing spans for block processing and import (#7816)

#7815

- removes all existing spans, so some span fields that appear in logs like `service_name` may be lost.
- instruments a few key code paths in the beacon node, starting from **root spans** named below:

* Gossip block and blobs
  * `process_gossip_data_column_sidecar`
  * `process_gossip_blob`
  * `process_gossip_block`
* RPC block and blobs
  * `process_rpc_block`
  * `process_rpc_blobs`
  * `process_rpc_custody_columns`
* RPC blocks (range and backfill)
  * `process_chain_segment`
* `PendingComponents` lifecycle
  * `pending_components`

To test locally:
* Run Grafana and Tempo with https://github.com/sigp/lighthouse-metrics/pull/57
* Run Lighthouse BN with `--telemetry-collector-url http://localhost:4317`

Some captured traces can be found here: https://hackmd.io/@jimmygchen/r1sLOxPPeg

Removing the old spans seems to have reduced memory usage quite a lot - I think we were using them on long-running tasks, and far too heavily:
<img width="910" height="495" alt="image" src="https://github.com/user-attachments/assets/5208bbe4-53b2-4ead-bc71-0b782c788669" />
This commit is contained in:
Jimmy Chen
2025-08-08 15:32:22 +10:00
committed by GitHub
parent 6dfab22267
commit 40c2fd5ff4
52 changed files with 633 additions and 1164 deletions

View File

@@ -5,7 +5,7 @@ use futures::channel::mpsc::Sender;
use futures::prelude::*;
use std::sync::Weak;
use tokio::runtime::{Handle, Runtime};
use tracing::{debug, instrument};
use tracing::debug;
pub use tokio::task::JoinHandle;
@@ -81,6 +81,8 @@ pub struct TaskExecutor {
signal_tx: Sender<ShutdownReason>,
/// The name of the service for inclusion in the logger output.
// FIXME(sproul): delete?
#[allow(dead_code)]
service_name: String,
}
@@ -92,7 +94,6 @@ impl TaskExecutor {
/// This function should only be used during testing. In production, prefer to obtain an
/// instance of `Self` via a `environment::RuntimeContext` (see the `lighthouse/environment`
/// crate).
#[instrument(parent = None,fields(service = service_name), name = "task_executor", skip_all)]
pub fn new<T: Into<HandleProvider>>(
handle: T,
exit: async_channel::Receiver<()>,
@@ -108,7 +109,6 @@ impl TaskExecutor {
}
/// Clones the task executor adding a service name.
#[instrument(parent = None, fields(service = service_name), name = "task_executor", skip_all)]
pub fn clone_with_name(&self, service_name: String) -> Self {
TaskExecutor {
handle_provider: self.handle_provider.clone(),
@@ -124,7 +124,6 @@ impl TaskExecutor {
/// The purpose of this function is to create a compile error if some function which previously
/// returned `()` starts returning something else. Such a case may otherwise result in
/// accidental error suppression.
#[instrument(parent = None, fields(service = self.service_name), name = "task_executor", skip_all)]
pub fn spawn_ignoring_error(
&self,
task: impl Future<Output = Result<(), ()>> + Send + 'static,
@@ -136,7 +135,6 @@ impl TaskExecutor {
/// Spawn a task to monitor the completion of another task.
///
/// If the other task exits by panicking, then the monitor task will shut down the executor.
#[instrument(parent = None, fields(service = self.service_name), name = "task_executor", skip_all)]
fn spawn_monitor<R: Send>(
&self,
task_handle: impl Future<Output = Result<R, tokio::task::JoinError>> + Send + 'static,
@@ -175,7 +173,6 @@ impl TaskExecutor {
/// of a panic, the executor will be shut down via `self.signal_tx`.
///
/// This function generates prometheus metrics on number of tasks and task duration.
#[instrument(parent = None, fields(service = self.service_name), name = "task_executor", skip_all)]
pub fn spawn(&self, task: impl Future<Output = ()> + Send + 'static, name: &'static str) {
if let Some(task_handle) = self.spawn_handle(task, name) {
self.spawn_monitor(task_handle, name)
@@ -191,7 +188,6 @@ impl TaskExecutor {
/// This is useful in cases where the future to be spawned needs to do additional cleanup work when
/// the task is completed/canceled (e.g. writing local variables to disk) or the task is created from
/// some framework which does its own cleanup (e.g. a hyper server).
#[instrument(parent = None, fields(service = self.service_name), name = "task_executor", skip_all)]
pub fn spawn_without_exit(
&self,
task: impl Future<Output = ()> + Send + 'static,
@@ -235,7 +231,6 @@ impl TaskExecutor {
/// The task is cancelled when the corresponding async-channel is dropped.
///
/// This function generates prometheus metrics on number of tasks and task duration.
#[instrument(parent = None, fields(service = self.service_name), name = "task_executor", skip_all)]
pub fn spawn_handle<R: Send + 'static>(
&self,
task: impl Future<Output = R> + Send + 'static,
@@ -283,7 +278,6 @@ impl TaskExecutor {
/// The Future returned behaves like the standard JoinHandle which can return an error if the
/// task failed.
/// This function generates prometheus metrics on number of tasks and task duration.
#[instrument(parent = None, fields(service = self.service_name), name = "task_executor", skip_all)]
pub fn spawn_blocking_handle<F, R>(
&self,
task: F,
@@ -332,7 +326,6 @@ impl TaskExecutor {
/// a `tokio` context present in the thread-local storage due to some `rayon` funkiness. Talk to
/// @paulhauner if you plan to use this function in production. He has put metrics in here to
/// track any use of it, so don't think you can pull a sneaky one on him.
#[instrument(parent = None, fields(service = self.service_name), name = "task_executor", skip_all)]
pub fn block_on_dangerous<F: Future>(
&self,
future: F,
@@ -368,7 +361,6 @@ impl TaskExecutor {
}
/// Returns a `Handle` to the current runtime.
#[instrument(parent = None, fields(service = self.service_name), name = "task_executor", skip_all)]
pub fn handle(&self) -> Option<Handle> {
self.handle_provider.handle()
}
@@ -383,7 +375,6 @@ impl TaskExecutor {
}
/// Get a channel to request shutting down.
#[instrument(parent = None, fields(service = self.service_name), name = "task_executor", skip_all)]
pub fn shutdown_sender(&self) -> Sender<ShutdownReason> {
self.signal_tx.clone()
}