mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-10 04:01:51 +00:00
Instrument tracing spans for block processing and import (#7816)
#7815 - removes all existing spans, so some span fields that appear in logs, like `service_name`, may be lost. - instruments a few key code paths in the beacon node, starting from the **root spans** named below: * Gossip block and blobs * `process_gossip_data_column_sidecar` * `process_gossip_blob` * `process_gossip_block` * RPC block and blobs * `process_rpc_block` * `process_rpc_blobs` * `process_rpc_custody_columns` * RPC blocks (range and backfill) * `process_chain_segment` * `PendingComponents` lifecycle * `pending_components` To test locally: * Run Grafana and Tempo with https://github.com/sigp/lighthouse-metrics/pull/57 * Run Lighthouse BN with `--telemetry-collector-url http://localhost:4317` Some captured traces can be found here: https://hackmd.io/@jimmygchen/r1sLOxPPeg Removing the old spans seems to have reduced memory usage quite a lot - I think we were using them on long-running tasks, and far too liberally: <img width="910" height="495" alt="image" src="https://github.com/user-attachments/assets/5208bbe4-53b2-4ead-bc71-0b782c788669" />
This commit is contained in:
@@ -27,7 +27,7 @@ use std::collections::{
|
||||
HashSet,
|
||||
};
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{debug, error, info, warn};
|
||||
use types::{Epoch, EthSpec};
|
||||
|
||||
/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
|
||||
@@ -147,10 +147,6 @@ pub struct BackFillSync<T: BeaconChainTypes> {
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
#[instrument(parent = None,
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn new(
|
||||
beacon_chain: Arc<BeaconChain<T>>,
|
||||
network_globals: Arc<NetworkGlobals<T::EthSpec>>,
|
||||
@@ -191,11 +187,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Pauses the backfill sync if it's currently syncing.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn pause(&mut self) {
|
||||
if let BackFillState::Syncing = self.state() {
|
||||
debug!(processed_epochs = %self.validated_batches, to_be_processed = %self.current_start,"Backfill sync paused");
|
||||
@@ -207,11 +198,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
///
|
||||
/// If resuming is successful, reports back the current syncing metrics.
|
||||
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn start(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -287,11 +273,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// A fully synced peer has joined us.
|
||||
/// If we are in a failed state, update a local variable to indicate we are able to restart
|
||||
/// the failed sync on the next attempt.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn fully_synced_peer_joined(&mut self) {
|
||||
if matches!(self.state(), BackFillState::Failed) {
|
||||
self.restart_failed_sync = true;
|
||||
@@ -300,11 +281,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
|
||||
/// A peer has disconnected.
|
||||
/// If the peer has active batches, those are considered failed and re-requested.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||
pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Result<(), BackFillError> {
|
||||
if matches!(self.state(), BackFillState::Failed) {
|
||||
@@ -319,11 +295,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// An RPC error has occurred.
|
||||
///
|
||||
/// If the batch exists it is re-requested.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||
pub fn inject_error(
|
||||
&mut self,
|
||||
@@ -361,11 +332,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// If this returns an error, the backfill sync has failed and will be restarted once new peers
|
||||
/// join the system.
|
||||
/// The sync manager should update the global sync state on failure.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||
pub fn on_block_response(
|
||||
&mut self,
|
||||
@@ -417,11 +383,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// The syncing process has failed.
|
||||
///
|
||||
/// This resets past variables, to allow for a fresh start when resuming.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn fail_sync(&mut self, error: BackFillError) -> Result<(), BackFillError> {
|
||||
// Some errors shouldn't fail the chain.
|
||||
if matches!(error, BackFillError::Paused) {
|
||||
@@ -453,11 +414,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
|
||||
/// Processes the batch with the given id.
|
||||
/// The batch must exist and be ready for processing
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn process_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -516,11 +472,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// The block processor has completed processing a batch. This function handles the result
|
||||
/// of the batch processor.
|
||||
/// If an error is returned the BackFill sync has failed.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||
pub fn on_batch_process_result(
|
||||
&mut self,
|
||||
@@ -673,11 +624,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Processes the next ready batch.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn process_completed_batches(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -741,11 +687,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
///
|
||||
/// If a previous batch has been validated and it had been re-processed, penalize the original
|
||||
/// peer.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn advance_chain(&mut self, network: &mut SyncNetworkContext<T>, validating_epoch: Epoch) {
|
||||
// make sure this epoch produces an advancement
|
||||
if validating_epoch >= self.current_start {
|
||||
@@ -837,11 +778,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// These events occur when a peer has successfully responded with blocks, but the blocks we
|
||||
/// have received are incorrect or invalid. This indicates the peer has not performed as
|
||||
/// intended and can result in downvoting a peer.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn handle_invalid_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -893,11 +829,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Requests the batch assigned to the given id from a given peer.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn send_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -969,11 +900,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
|
||||
/// When resuming a chain, this function searches for batches that need to be re-downloaded and
|
||||
/// transitions their state to redownload the batch.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn resume_batches(&mut self, network: &mut SyncNetworkContext<T>) -> Result<(), BackFillError> {
|
||||
let batch_ids_to_retry = self
|
||||
.batches
|
||||
@@ -998,11 +924,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
|
||||
/// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer
|
||||
/// pool and left over batches until the batch buffer is reached or all peers are exhausted.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn request_batches(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -1027,11 +948,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
|
||||
/// Creates the next required batch from the chain. If there are no more batches required,
|
||||
/// `false` is returned.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn include_next_batch(&mut self, network: &mut SyncNetworkContext<T>) -> Option<BatchId> {
|
||||
// don't request batches beyond genesis;
|
||||
if self.last_batch_downloaded {
|
||||
@@ -1093,11 +1009,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
///
|
||||
/// This errors if the beacon chain indicates that backfill sync has already completed or is
|
||||
/// not required.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn reset_start_epoch(&mut self) -> Result<(), ResetEpochError> {
|
||||
let anchor_info = self.beacon_chain.store.get_anchor_info();
|
||||
if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) {
|
||||
@@ -1111,11 +1022,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Checks with the beacon chain if backfill sync has completed.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn check_completed(&mut self) -> bool {
|
||||
if self.would_complete(self.current_start) {
|
||||
// Check that the beacon chain agrees
|
||||
@@ -1131,11 +1037,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Checks if backfill would complete by syncing to `start_epoch`.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn would_complete(&self, start_epoch: Epoch) -> bool {
|
||||
start_epoch
|
||||
<= self
|
||||
@@ -1145,20 +1046,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Updates the global network state indicating the current state of a backfill sync.
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn set_state(&self, state: BackFillState) {
|
||||
// Overwrite the value held in `network_globals.backfill_state` through its `write()` guard.
*self.network_globals.backfill_state.write() = state;
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
/// Returns a clone of the backfill state currently held in `network_globals.backfill_state`,
/// obtained through its `read()` guard.
fn state(&self) -> BackFillState {
|
||||
self.network_globals.backfill_state.read().clone()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user