Instrument tracing spans for block processing and import (#7816)

#7815

- removes all existing spans, so some span fields that appear in logs like `service_name` may be lost.
- instruments a few key code paths in the beacon node, starting from **root spans** named below:

* Gossip block and blobs
* `process_gossip_data_column_sidecar`
* `process_gossip_blob`
* `process_gossip_block`
* Rpc block and blobs
* `process_rpc_block`
* `process_rpc_blobs`
* `process_rpc_custody_columns`
* Rpc blocks (range and backfill)
* `process_chain_segment`
* `PendingComponents` lifecycle
* `pending_components`

To test locally:
* Run Grafana and Tempo with https://github.com/sigp/lighthouse-metrics/pull/57
* Run Lighthouse BN with `--telemetry-collector-url http://localhost:4317`

Some captured traces can be found here: https://hackmd.io/@jimmygchen/r1sLOxPPeg

Removing the old spans seems to have reduced memory usage quite a lot - I think we were using them excessively, including on long-running tasks:
<img width="910" height="495" alt="image" src="https://github.com/user-attachments/assets/5208bbe4-53b2-4ead-bc71-0b782c788669" />
This commit is contained in:
Jimmy Chen
2025-08-08 15:32:22 +10:00
committed by GitHub
parent 6dfab22267
commit 40c2fd5ff4
52 changed files with 633 additions and 1164 deletions

View File

@@ -60,6 +60,9 @@ lighthouse_version = { workspace = true }
logging = { workspace = true }
malloc_utils = { workspace = true }
metrics = { workspace = true }
opentelemetry = { workspace = true }
opentelemetry-otlp = { workspace = true }
opentelemetry_sdk = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
serde_yaml = { workspace = true }
@@ -67,6 +70,7 @@ slasher = { workspace = true }
store = { workspace = true }
task_executor = { workspace = true }
tracing = { workspace = true }
tracing-opentelemetry = { workspace = true }
tracing-subscriber = { workspace = true }
types = { workspace = true }
unused_port = { workspace = true }

View File

@@ -19,6 +19,8 @@ use futures::TryFutureExt;
use lighthouse_version::VERSION;
use logging::{build_workspace_filter, crit, MetricsLayer};
use malloc_utils::configure_memory_allocator;
use opentelemetry::trace::TracerProvider;
use opentelemetry_otlp::WithExportConfig;
use std::backtrace::Backtrace;
use std::io::IsTerminal;
use std::path::PathBuf;
@@ -278,6 +280,18 @@ fn main() {
.default_value("info")
.display_order(0)
)
.arg(
Arg::new("telemetry-collector-url")
.long("telemetry-collector-url")
.value_name("URL")
.help(
"URL of the OpenTelemetry collector to export tracing spans \
(e.g., http://localhost:4317). If not set, tracing export is disabled.",
)
.action(ArgAction::Set)
.global(true)
.display_order(0)
)
.arg(
Arg::new("datadir")
.long("datadir")
@@ -677,6 +691,39 @@ fn run<E: EthSpec>(
logging_layers.push(MetricsLayer.boxed());
let mut environment = builder
.multi_threaded_tokio_runtime()?
.eth2_network_config(eth2_network_config)?
.build()?;
if let Some(telemetry_collector_url) = matches.get_one::<String>("telemetry-collector-url") {
let telemetry_layer = environment.runtime().block_on(async {
let exporter = opentelemetry_otlp::SpanExporter::builder()
.with_tonic()
.with_endpoint(telemetry_collector_url)
.build()
.map_err(|e| format!("Failed to create OTLP exporter: {:?}", e))?;
let provider = opentelemetry_sdk::trace::SdkTracerProvider::builder()
.with_batch_exporter(exporter)
.with_resource(
opentelemetry_sdk::Resource::builder()
.with_service_name("lighthouse")
.build(),
)
.build();
let tracer = provider.tracer("lighthouse");
Ok::<_, String>(
tracing_opentelemetry::layer()
.with_tracer(tracer)
.with_filter(workspace_filter),
)
})?;
logging_layers.push(telemetry_layer.boxed());
}
#[cfg(feature = "console-subscriber")]
{
let console_layer = console_subscriber::spawn();
@@ -691,11 +738,6 @@ fn run<E: EthSpec>(
eprintln!("Failed to initialize logger: {e}");
}
let mut environment = builder
.multi_threaded_tokio_runtime()?
.eth2_network_config(eth2_network_config)?
.build()?;
// Log panics properly.
{
std::panic::set_hook(Box::new(move |info| {