mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-15 19:02:42 +00:00
Instrument tracing spans for block processing and import (#7816)
#7815 - removes all existing spans, so some span fields that appear in logs like `service_name` may be lost. - instruments a few key code paths in the beacon node, starting from **root spans** named below: * Gossip block and blobs * `process_gossip_data_column_sidecar` * `process_gossip_blob` * `process_gossip_block` * Rpc block and blobs * `process_rpc_block` * `process_rpc_blobs` * `process_rpc_custody_columns` * Rpc blocks (range and backfill) * `process_chain_segment` * `PendingComponents` lifecycle * `pending_components` To test locally: * Run Grafana and Tempo with https://github.com/sigp/lighthouse-metrics/pull/57 * Run Lighthouse BN with `--telemetry-collector-url http://localhost:4317` Some captured traces can be found here: https://hackmd.io/@jimmygchen/r1sLOxPPeg Removing the old spans seems to have reduced the memory usage quite a lot - I think we were using them on long-running tasks and too excessively: <img width="910" height="495" alt="image" src="https://github.com/user-attachments/assets/5208bbe4-53b2-4ead-bc71-0b782c788669" />
This commit is contained in:
@@ -12,7 +12,7 @@ use lighthouse_network::{PeerAction, PeerId};
|
||||
use logging::crit;
|
||||
use std::collections::{btree_map::Entry, BTreeMap, HashSet};
|
||||
use strum::IntoStaticStr;
|
||||
use tracing::{debug, instrument, warn};
|
||||
use tracing::{debug, warn};
|
||||
use types::{ColumnIndex, Epoch, EthSpec, Hash256, Slot};
|
||||
|
||||
/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
|
||||
@@ -205,7 +205,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// A block has been received for a batch on this chain.
|
||||
/// If the block correctly completes the batch it will be processed if possible.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn on_block_response(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -252,7 +251,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// Processes the batch with the given id.
|
||||
/// The batch must exist and be ready for processing
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn process_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -300,7 +298,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
/// Processes the next ready batch, prioritizing optimistic batches over the processing target.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn process_completed_batches(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -410,7 +407,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// The block processor has completed processing a batch. This function handles the result
|
||||
/// of the batch processor.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn on_batch_process_result(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -565,7 +561,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn reject_optimistic_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -600,7 +595,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// If a previous batch has been validated and it had been re-processed, penalize the original
|
||||
/// peer.
|
||||
#[allow(clippy::modulo_one)]
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn advance_chain(&mut self, network: &mut SyncNetworkContext<T>, validating_epoch: Epoch) {
|
||||
// make sure this epoch produces an advancement
|
||||
if validating_epoch <= self.start_epoch {
|
||||
@@ -704,7 +698,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// These events occur when a peer has successfully responded with blocks, but the blocks we
|
||||
/// have received are incorrect or invalid. This indicates the peer has not performed as
|
||||
/// intended and can result in downvoting a peer.
|
||||
#[instrument(parent = None, fields(service = self.id, network), skip_all)]
|
||||
fn handle_invalid_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -764,7 +757,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// This chain has been requested to start syncing.
|
||||
///
|
||||
/// This could be new chain, or an old chain that is being resumed.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn start_syncing(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -803,7 +795,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// Add a peer to the chain.
|
||||
///
|
||||
/// If the chain is active, this starts requesting batches from this peer.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn add_peer(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -816,7 +807,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// An RPC error has occurred.
|
||||
///
|
||||
/// If the batch exists it is re-requested.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn inject_error(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -900,7 +890,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
/// Requests the batch assigned to the given id from a given peer.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn send_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -979,7 +968,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
/// Retries partial column requests within the batch by creating new requests for the failed columns.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn retry_partial_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -1032,7 +1020,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// Kickstarts the chain by sending for processing batches that are ready and requesting more
|
||||
/// batches if needed.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn resume(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -1045,7 +1032,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer
|
||||
/// pool and left over batches until the batch buffer is reached or all peers are exhausted.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn request_batches(&mut self, network: &mut SyncNetworkContext<T>) -> ProcessingResult {
|
||||
if !matches!(self.state, ChainSyncingState::Syncing) {
|
||||
return Ok(KeepChain);
|
||||
@@ -1114,7 +1100,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// Creates the next required batch from the chain. If there are no more batches required,
|
||||
/// `false` is returned.
|
||||
#[instrument(parent = None, fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn include_next_batch(&mut self, network: &mut SyncNetworkContext<T>) -> Option<BatchId> {
|
||||
// don't request batches beyond the target head slot
|
||||
if self
|
||||
|
||||
@@ -55,7 +55,7 @@ use logging::crit;
|
||||
use lru_cache::LRUTimeCache;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, instrument, trace, warn};
|
||||
use tracing::{debug, trace, warn};
|
||||
use types::{Epoch, EthSpec, Hash256};
|
||||
|
||||
/// For how long we store failed finalized chains to prevent retries.
|
||||
@@ -81,11 +81,6 @@ impl<T: BeaconChainTypes> RangeSync<T>
|
||||
where
|
||||
T: BeaconChainTypes,
|
||||
{
|
||||
#[instrument(parent = None,
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn new(beacon_chain: Arc<BeaconChain<T>>) -> Self {
|
||||
RangeSync {
|
||||
beacon_chain: beacon_chain.clone(),
|
||||
@@ -102,11 +97,6 @@ where
|
||||
self.failed_chains.keys().copied().collect()
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn state(&self) -> SyncChainStatus {
|
||||
self.chains.state()
|
||||
}
|
||||
@@ -116,11 +106,6 @@ where
|
||||
/// may need to be synced as a result. A new peer, may increase the peer pool of a finalized
|
||||
/// chain, this may result in a different finalized chain from syncing as finalized chains are
|
||||
/// prioritised by peer-pool size.
|
||||
#[instrument(parent = None,
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn add_peer(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -215,11 +200,6 @@ where
|
||||
///
|
||||
/// This function finds the chain that made this request. Once found, processes the result.
|
||||
/// This request could complete a chain or simply add to its progress.
|
||||
#[instrument(parent = None,
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn blocks_by_range_response(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -250,11 +230,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn handle_block_process_result(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -287,11 +262,6 @@ where
|
||||
|
||||
/// A peer has disconnected. This removes the peer from any ongoing chains and mappings. A
|
||||
/// disconnected peer could remove a chain
|
||||
#[instrument(parent = None,
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn peer_disconnect(&mut self, network: &mut SyncNetworkContext<T>, peer_id: &PeerId) {
|
||||
// if the peer is in the awaiting head mapping, remove it
|
||||
self.awaiting_head_peers.remove(peer_id);
|
||||
@@ -304,11 +274,6 @@ where
|
||||
/// which pool the peer is in. The chain may also have a batch or batches awaiting
|
||||
/// for this peer. If so we mark the batch as failed. The batch may then hit it's maximum
|
||||
/// retries. In this case, we need to remove the chain.
|
||||
#[instrument(parent = None,
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn remove_peer(&mut self, network: &mut SyncNetworkContext<T>, peer_id: &PeerId) {
|
||||
for (removed_chain, sync_type, remove_reason) in
|
||||
self.chains.call_all(|chain| chain.remove_peer(peer_id))
|
||||
@@ -327,11 +292,6 @@ where
|
||||
///
|
||||
/// Check to see if the request corresponds to a pending batch. If so, re-request it if possible, if there have
|
||||
/// been too many failed attempts for the batch, remove the chain.
|
||||
#[instrument(parent = None,
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn inject_error(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -362,11 +322,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn on_chain_removed(
|
||||
&mut self,
|
||||
chain: SyncingChain<T>,
|
||||
@@ -415,11 +370,6 @@ where
|
||||
}
|
||||
|
||||
/// Kickstarts sync.
|
||||
#[instrument(parent = None,
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn resume(&mut self, network: &mut SyncNetworkContext<T>) {
|
||||
for (removed_chain, sync_type, remove_reason) in
|
||||
self.chains.call_all(|chain| chain.resume(network))
|
||||
|
||||
Reference in New Issue
Block a user