Instrument tracing spans for block processing and import (#7816)

#7815

- Removes all existing spans, so some span fields that previously appeared in logs (such as `service_name`) may be lost.
- Instruments a few key code paths in the beacon node, starting from the **root spans** named below:

* Gossip block and blobs
* `process_gossip_data_column_sidecar`
* `process_gossip_blob`
* `process_gossip_block`
* Rpc block and blobs
* `process_rpc_block`
* `process_rpc_blobs`
* `process_rpc_custody_columns`
* Rpc blocks (range and backfill)
* `process_chain_segment`
* `PendingComponents` lifecycle
* `pending_components`

To test locally:
* Run Grafana and Tempo with https://github.com/sigp/lighthouse-metrics/pull/57
* Run Lighthouse BN with `--telemetry-collector-url http://localhost:4317`

Some captured traces can be found here: https://hackmd.io/@jimmygchen/r1sLOxPPeg

Removing the old spans seems to have reduced memory usage quite a lot - I think we were using them on long-running tasks and too excessively:
<img width="910" height="495" alt="image" src="https://github.com/user-attachments/assets/5208bbe4-53b2-4ead-bc71-0b782c788669" />
This commit is contained in:
Jimmy Chen
2025-08-08 15:32:22 +10:00
committed by GitHub
parent 6dfab22267
commit 40c2fd5ff4
52 changed files with 633 additions and 1164 deletions

View File

@@ -21,7 +21,7 @@ use std::str::Utf8Error;
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use store::AbstractExecPayload;
use tracing::{debug, error, info, instrument, warn};
use tracing::{debug, error, info, warn};
use types::consts::altair::{
TIMELY_HEAD_FLAG_INDEX, TIMELY_SOURCE_FLAG_INDEX, TIMELY_TARGET_FLAG_INDEX,
};
@@ -405,10 +405,6 @@ pub struct ValidatorMonitor<E: EthSpec> {
}
impl<E: EthSpec> ValidatorMonitor<E> {
#[instrument(parent = None,
name = "validator_monitor",
skip_all
)]
pub fn new(
config: ValidatorMonitorConfig,
beacon_proposer_cache: Arc<Mutex<BeaconProposerCache>>,
@@ -438,21 +434,11 @@ impl<E: EthSpec> ValidatorMonitor<E> {
/// Returns `true` when the validator count is sufficiently low enough to
/// emit metrics and logs on a per-validator basis (rather than just an
/// aggregated basis).
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn individual_tracking(&self) -> bool {
    // Per-validator reporting stays enabled only while the number of
    // monitored validators does not exceed the configured threshold.
    let monitored_count = self.validators.len();
    monitored_count <= self.individual_tracking_threshold
}
/// Add some validators to `self` for additional monitoring.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn add_validator_pubkey(&mut self, pubkey: PublicKeyBytes) {
let index_opt = self
.indices
@@ -470,11 +456,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Add an unaggregated attestation
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn set_unaggregated_attestation(&mut self, attestation: Attestation<E>) {
let unaggregated_attestations = &mut self.unaggregated_attestations;
@@ -488,22 +469,12 @@ impl<E: EthSpec> ValidatorMonitor<E> {
self.unaggregated_attestations.insert(slot, attestation);
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn get_unaggregated_attestation(&self, slot: Slot) -> Option<&Attestation<E>> {
    // Look up the attestation stored under `slot`; `None` when nothing is stored for it.
    let stored = &self.unaggregated_attestations;
    stored.get(&slot)
}
/// Reads information from the given `state`. The `state` *must* be valid (i.e, able to be
/// imported).
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn process_valid_state(
&mut self,
current_epoch: Epoch,
@@ -616,11 +587,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Add missed non-finalized blocks for the monitored validators
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn add_validators_missed_blocks(&mut self, state: &BeaconState<E>) {
// Define range variables
let current_slot = state.slot();
@@ -717,11 +683,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn get_proposers_by_epoch_from_cache(
&mut self,
epoch: Epoch,
@@ -735,11 +696,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
/// Process the unaggregated attestations generated by the service `attestation_simulator_service`
/// and check if the attestation qualifies for a reward matching the flags source/target/head
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn process_unaggregated_attestations(&mut self, state: &BeaconState<E>, spec: &ChainSpec) {
let current_slot = state.slot();
@@ -812,11 +768,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
///
/// We allow disabling tracking metrics on an individual validator basis
/// since it can result in untenable cardinality with high validator counts.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn aggregatable_metric<F: Fn(&str)>(&self, individual_id: &str, func: F) {
func(TOTAL_LABEL);
@@ -825,11 +776,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn process_validator_statuses(
&self,
epoch: Epoch,
@@ -1107,11 +1053,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
Ok(())
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn get_validator(&self, validator_index: u64) -> Option<&MonitoredValidator> {
self.indices
.get(&validator_index)
@@ -1119,30 +1060,15 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Returns the number of validators monitored by `self`.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn num_validators(&self) -> usize {
    // The count is simply the size of the `validators` collection.
    let monitored = &self.validators;
    monitored.len()
}
/// Return the `id`'s of all monitored validators.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn get_all_monitored_validators(&self) -> Vec<String> {
    // Clone each monitored validator's `id` into a fresh vector, in the
    // iteration order of `self.validators.values()`.
    let mut ids = Vec::with_capacity(self.validators.len());
    for validator in self.validators.values() {
        ids.push(validator.id.clone());
    }
    ids
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn get_monitored_validator(&self, index: u64) -> Option<&MonitoredValidator> {
if let Some(pubkey) = self.indices.get(&index) {
self.validators.get(pubkey)
@@ -1151,11 +1077,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn get_monitored_validator_missed_block_count(&self, validator_index: u64) -> u64 {
self.missed_blocks
.iter()
@@ -1163,22 +1084,12 @@ impl<E: EthSpec> ValidatorMonitor<E> {
.count() as u64
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn get_beacon_proposer_cache(&self) -> Arc<Mutex<BeaconProposerCache>> {
    // Hand out another handle to the same shared cache; only the reference
    // count is bumped, the cache itself is not copied.
    Arc::clone(&self.beacon_proposer_cache)
}
/// If `self.auto_register == true`, add the `validator_index` to `self.monitored_validators`.
/// Otherwise, do nothing.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn auto_register_local_validator(&mut self, validator_index: u64) {
if !self.auto_register {
return;
@@ -1201,11 +1112,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Process a block received on gossip.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_gossip_block<S: SlotClock>(
&self,
seen_timestamp: Duration,
@@ -1217,11 +1123,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Process a block received on the HTTP API from a local validator.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_api_block<S: SlotClock>(
&self,
seen_timestamp: Duration,
@@ -1232,11 +1133,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
self.register_beacon_block("api", seen_timestamp, block, block_root, slot_clock)
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn register_beacon_block<S: SlotClock>(
&self,
src: &str,
@@ -1276,11 +1172,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register an attestation seen on the gossip network.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_gossip_unaggregated_attestation<S: SlotClock>(
&self,
seen_timestamp: Duration,
@@ -1296,11 +1187,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register an attestation seen on the HTTP API.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_api_unaggregated_attestation<S: SlotClock>(
&self,
seen_timestamp: Duration,
@@ -1315,11 +1201,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
)
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn register_unaggregated_attestation<S: SlotClock>(
&self,
src: &str,
@@ -1406,11 +1287,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
)
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn register_aggregated_attestation<S: SlotClock>(
&self,
src: &str,
@@ -1529,10 +1405,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
/// We use the parent slot instead of block slot to ignore skip slots when calculating inclusion distance.
///
/// Note: Blocks that get orphaned will skew the inclusion distance calculation.
#[instrument(parent = None,
name = "validator_monitor",
skip_all
)]
pub fn register_attestation_in_block(
&self,
indexed_attestation: IndexedAttestationRef<'_, E>,
@@ -1608,11 +1480,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register a sync committee message received over gossip.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_gossip_sync_committee_message<S: SlotClock>(
&self,
seen_timestamp: Duration,
@@ -1628,11 +1495,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register a sync committee message received over the http api.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_api_sync_committee_message<S: SlotClock>(
&self,
seen_timestamp: Duration,
@@ -1648,11 +1510,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register a sync committee message.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn register_sync_committee_message<S: SlotClock>(
&self,
src: &str,
@@ -1702,11 +1559,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register a sync committee contribution received over gossip.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_gossip_sync_committee_contribution<S: SlotClock>(
&self,
seen_timestamp: Duration,
@@ -1724,11 +1576,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register a sync committee contribution received over the http api.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_api_sync_committee_contribution<S: SlotClock>(
&self,
seen_timestamp: Duration,
@@ -1746,11 +1593,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register a sync committee contribution.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn register_sync_committee_contribution<S: SlotClock>(
&self,
src: &str,
@@ -1833,11 +1675,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register that the `sync_aggregate` was included in a *valid* `BeaconBlock`.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_sync_aggregate_in_block(
&self,
slot: Slot,
@@ -1875,40 +1712,20 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register an exit from the gossip network.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_gossip_voluntary_exit(&self, exit: &VoluntaryExit) {
    // Forward to the shared handler, labelling the exit with its source.
    let src = "gossip";
    self.register_voluntary_exit(src, exit);
}
/// Register an exit from the HTTP API.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_api_voluntary_exit(&self, exit: &VoluntaryExit) {
    // Forward to the shared handler, labelling the exit with its source.
    let src = "api";
    self.register_voluntary_exit(src, exit);
}
/// Register an exit included in a *valid* beacon block.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_block_voluntary_exit(&self, exit: &VoluntaryExit) {
    // Forward to the shared handler, labelling the exit with its source.
    let src = "block";
    self.register_voluntary_exit(src, exit);
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn register_voluntary_exit(&self, src: &str, exit: &VoluntaryExit) {
if let Some(validator) = self.get_validator(exit.validator_index) {
let id = &validator.id;
@@ -1932,40 +1749,20 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register a proposer slashing from the gossip network.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_gossip_proposer_slashing(&self, slashing: &ProposerSlashing) {
    // Forward to the shared handler, labelling the slashing with its source.
    let src = "gossip";
    self.register_proposer_slashing(src, slashing);
}
/// Register a proposer slashing from the HTTP API.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_api_proposer_slashing(&self, slashing: &ProposerSlashing) {
    // Forward to the shared handler, labelling the slashing with its source.
    let src = "api";
    self.register_proposer_slashing(src, slashing);
}
/// Register a proposer slashing included in a *valid* `BeaconBlock`.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_block_proposer_slashing(&self, slashing: &ProposerSlashing) {
    // Forward to the shared handler, labelling the slashing with its source.
    let src = "block";
    self.register_proposer_slashing(src, slashing);
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn register_proposer_slashing(&self, src: &str, slashing: &ProposerSlashing) {
let proposer = slashing.signed_header_1.message.proposer_index;
let slot = slashing.signed_header_1.message.slot;
@@ -1999,40 +1796,20 @@ impl<E: EthSpec> ValidatorMonitor<E> {
}
/// Register an attester slashing from the gossip network.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_gossip_attester_slashing(&self, slashing: AttesterSlashingRef<'_, E>) {
    // Forward to the shared handler, labelling the slashing with its source.
    let src = "gossip";
    self.register_attester_slashing(src, slashing);
}
/// Register an attester slashing from the HTTP API.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_api_attester_slashing(&self, slashing: AttesterSlashingRef<'_, E>) {
    // Forward to the shared handler, labelling the slashing with its source.
    let src = "api";
    self.register_attester_slashing(src, slashing);
}
/// Register an attester slashing included in a *valid* `BeaconBlock`.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn register_block_attester_slashing(&self, slashing: AttesterSlashingRef<'_, E>) {
    // Forward to the shared handler, labelling the slashing with its source.
    let src = "block";
    self.register_attester_slashing(src, slashing);
}
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
fn register_attester_slashing(&self, src: &str, slashing: AttesterSlashingRef<'_, E>) {
let data = slashing.attestation_1().data();
let attestation_1_indices: HashSet<u64> = slashing
@@ -2074,11 +1851,6 @@ impl<E: EthSpec> ValidatorMonitor<E> {
/// Scrape `self` for metrics.
///
/// Should be called whenever Prometheus is scraping Lighthouse.
#[instrument(parent = None,
fields(service = "validator_monitor"),
name = "validator_monitor",
skip_all
)]
pub fn scrape_metrics<S: SlotClock>(&self, slot_clock: &S, spec: &ChainSpec) {
metrics::set_gauge(
&metrics::VALIDATOR_MONITOR_VALIDATORS_TOTAL,