Instrument tracing spans for block processing and import (#7816)

#7815

- removes all existing spans, so some span fields that currently appear in logs (such as `service_name`) may be lost.
- instruments a few key code paths in the beacon node, starting from **root spans** named below:

* Gossip block and blobs
* `process_gossip_data_column_sidecar`
* `process_gossip_blob`
* `process_gossip_block`
* Rpc block and blobs
* `process_rpc_block`
* `process_rpc_blobs`
* `process_rpc_custody_columns`
* Rpc blocks (range and backfill)
* `process_chain_segment`
* `PendingComponents` lifecycle
* `pending_components`

To test locally:
* Run Grafana and Tempo with https://github.com/sigp/lighthouse-metrics/pull/57
* Run Lighthouse BN with `--telemetry-collector-url http://localhost:4317`

Some captured traces can be found here: https://hackmd.io/@jimmygchen/r1sLOxPPeg

Removing the old spans seems to have reduced memory usage quite a lot - I think we were using them on long-running tasks and too excessively:
<img width="910" height="495" alt="image" src="https://github.com/user-attachments/assets/5208bbe4-53b2-4ead-bc71-0b782c788669" />
This commit is contained in:
Jimmy Chen
2025-08-08 15:32:22 +10:00
committed by GitHub
parent 6dfab22267
commit 40c2fd5ff4
52 changed files with 633 additions and 1164 deletions

View File

@@ -45,7 +45,7 @@ use std::time::Duration;
#[cfg(test)]
use task_executor::TaskExecutor;
use tokio::sync::mpsc;
use tracing::{debug, error, span, warn, Level};
use tracing::{debug, error, warn};
use types::blob_sidecar::FixedBlobSidecarList;
use types::{
BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext,
@@ -267,12 +267,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
chain: Arc<BeaconChain<T>>,
fork_context: Arc<ForkContext>,
) -> Self {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
SyncNetworkContext {
network_send,
execution_engine_state: EngineState::Online, // always assume `Online` at the start
@@ -374,13 +368,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
}
pub fn status_peers<C: ToStatusMessage>(&self, chain: &C, peers: impl Iterator<Item = PeerId>) {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let status_message = chain.status_message();
for peer_id in peers {
debug!(
@@ -776,13 +763,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
return Ok(LookupRequestResult::Pending("no peers"));
};
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
match self.chain.get_block_process_status(&block_root) {
// Unknown block, continue request to download
BlockProcessStatus::Unknown => {}
@@ -882,13 +862,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
return Ok(LookupRequestResult::Pending("no peers"));
};
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let imported_blob_indexes = self
.chain
.data_availability_checker
@@ -953,13 +926,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
request: DataColumnsByRootSingleBlockRequest,
expect_max_responses: bool,
) -> Result<LookupRequestResult<DataColumnsByRootRequestId>, &'static str> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let id = DataColumnsByRootRequestId {
id: self.next_id(),
requester,
@@ -1004,13 +970,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
block_root: Hash256,
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
) -> Result<LookupRequestResult, RpcRequestSendError> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let custody_indexes_imported = self
.chain
.data_availability_checker
@@ -1212,26 +1171,12 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
}
/// Replaces the stored `execution_engine_state` with `engine_state`, emitting a debug log
/// of the previous and the new value before the assignment.
pub fn update_execution_engine_state(&mut self, engine_state: EngineState) {
// Build an INFO-level span named "SyncNetworkContext" carrying `service = "network_context"`.
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
// Keep the guard bound to `_enter` so the span stays entered until the function returns.
let _enter = span.enter();
// Log before the assignment below so `past_state` still reports the old value.
debug!(past_state = ?self.execution_engine_state, new_state = ?engine_state, "Sync's view on execution engine state updated");
self.execution_engine_state = engine_state;
}
/// Terminates the connection with the peer and bans them.
pub fn goodbye_peer(&mut self, peer_id: PeerId, reason: GoodbyeReason) {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
self.network_send
.send(NetworkMessage::GoodbyePeer {
peer_id,
@@ -1245,13 +1190,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
/// Reports to the scoring algorithm the behaviour of a peer.
pub fn report_peer(&self, peer_id: PeerId, action: PeerAction, msg: &'static str) {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
debug!(%peer_id, %action, %msg, "Sync reporting peer");
self.network_send
.send(NetworkMessage::ReportPeer {
@@ -1267,13 +1205,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
/// Subscribes to core topics.
pub fn subscribe_core_topics(&self) {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
self.network_send
.send(NetworkMessage::SubscribeCoreTopics)
.unwrap_or_else(|e| {
@@ -1283,13 +1214,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
/// Sends an arbitrary network message.
fn send_network_msg(&self, msg: NetworkMessage<T::EthSpec>) -> Result<(), &'static str> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
self.network_send.send(msg).map_err(|_| {
debug!("Could not send message to the network service");
"Network channel send Failed"
@@ -1514,13 +1438,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
peer_id: PeerId,
resp: RpcResponseResult<Vec<Arc<DataColumnSidecar<T::EthSpec>>>>,
) -> Option<CustodyByRootResult<T::EthSpec>> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
// Note: need to remove the request to borrow self again below. Otherwise we can't
// do nested requests
let Some(mut request) = self.custody_by_root_requests.remove(&id.requester) else {
@@ -1540,13 +1457,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
request: ActiveCustodyRequest<T>,
result: CustodyRequestResult<T::EthSpec>,
) -> Option<CustodyByRootResult<T::EthSpec>> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let result = result
.map_err(RpcResponseError::CustodyRequestError)
.transpose();
@@ -1574,13 +1484,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
block: Arc<SignedBeaconBlock<T::EthSpec>>,
seen_timestamp: Duration,
) -> Result<(), SendErrorProcessor> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let beacon_processor = self
.beacon_processor_if_enabled()
.ok_or(SendErrorProcessor::ProcessorNotAvailable)?;
@@ -1613,13 +1516,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
blobs: FixedBlobSidecarList<T::EthSpec>,
seen_timestamp: Duration,
) -> Result<(), SendErrorProcessor> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let beacon_processor = self
.beacon_processor_if_enabled()
.ok_or(SendErrorProcessor::ProcessorNotAvailable)?;
@@ -1651,13 +1547,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
seen_timestamp: Duration,
process_type: BlockProcessType,
) -> Result<(), SendErrorProcessor> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let beacon_processor = self
.beacon_processor_if_enabled()
.ok_or(SendErrorProcessor::ProcessorNotAvailable)?;