Integrate tracing (#6339)

Tracing Integration
- [reference](5bbf1859e9/projects/project-ideas.md (L297))


  - [x] replace slog & log with tracing throughout the codebase
- [x] implement custom crit log
- [x] make relevant changes in the formatter
- [x] replace sloggers
- [x] re-write SSE logging components

cc: @macladson @eserilev
This commit is contained in:
ThreeHrSleep
2025-03-13 04:01:05 +05:30
committed by GitHub
parent f23f984f85
commit d60c24ef1c
241 changed files with 9485 additions and 9328 deletions

View File

@@ -20,13 +20,14 @@ use beacon_chain::{BeaconChain, BeaconChainTypes};
use lighthouse_network::service::api_types::Id;
use lighthouse_network::types::{BackFillState, NetworkGlobals};
use lighthouse_network::{PeerAction, PeerId};
use logging::crit;
use rand::seq::SliceRandom;
use slog::{crit, debug, error, info, warn};
use std::collections::{
btree_map::{BTreeMap, Entry},
HashMap, HashSet,
};
use std::sync::Arc;
use tracing::{debug, error, info, instrument, warn};
use types::{Epoch, EthSpec};
/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
@@ -146,16 +147,17 @@ pub struct BackFillSync<T: BeaconChainTypes> {
/// Reference to the network globals in order to obtain valid peers to backfill blocks from
/// (i.e synced peers).
network_globals: Arc<NetworkGlobals<T::EthSpec>>,
/// A logger for backfill sync.
log: slog::Logger,
}
impl<T: BeaconChainTypes> BackFillSync<T> {
#[instrument(parent = None,
level = "info",
name = "backfill_sync",
skip_all
)]
pub fn new(
beacon_chain: Arc<BeaconChain<T>>,
network_globals: Arc<NetworkGlobals<T::EthSpec>>,
log: slog::Logger,
) -> Self {
// Determine if backfill is enabled or not.
// If, for some reason a backfill has already been completed (or we've used a trusted
@@ -186,7 +188,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
participating_peers: HashSet::new(),
restart_failed_sync: false,
beacon_chain,
log,
};
// Update the global network state with the current backfill state.
@@ -195,9 +196,15 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Pauses the backfill sync if it's currently syncing.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
pub fn pause(&mut self) {
if let BackFillState::Syncing = self.state() {
debug!(self.log, "Backfill sync paused"; "processed_epochs" => self.validated_batches, "to_be_processed" => self.current_start);
debug!(processed_epochs = %self.validated_batches, to_be_processed = %self.current_start,"Backfill sync paused");
self.set_state(BackFillState::Paused);
}
}
@@ -206,6 +213,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
///
/// If resuming is successful, reports back the current syncing metrics.
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
pub fn start(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -222,7 +235,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
.is_some()
{
// If there are peers to resume with, begin the resume.
debug!(self.log, "Resuming backfill sync"; "start_epoch" => self.current_start, "awaiting_batches" => self.batches.len(), "processing_target" => self.processing_target);
debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync");
self.set_state(BackFillState::Syncing);
// Resume any previously failed batches.
self.resume_batches(network)?;
@@ -251,14 +264,14 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// This infallible match exists to force us to update this code if a future
// refactor of `ResetEpochError` adds a variant.
let ResetEpochError::SyncCompleted = e;
error!(self.log, "Backfill sync completed whilst in failed status");
error!("Backfill sync completed whilst in failed status");
self.set_state(BackFillState::Completed);
return Err(BackFillError::InvalidSyncState(String::from(
"chain completed",
)));
}
debug!(self.log, "Resuming a failed backfill sync"; "start_epoch" => self.current_start);
debug!(start_epoch = %self.current_start, "Resuming a failed backfill sync");
// begin requesting blocks from the peer pool, until all peers are exhausted.
self.request_batches(network)?;
@@ -281,6 +294,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// A fully synced peer has joined us.
/// If we are in a failed state, update a local variable to indicate we are able to restart
/// the failed sync on the next attempt.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
pub fn fully_synced_peer_joined(&mut self) {
if matches!(self.state(), BackFillState::Failed) {
self.restart_failed_sync = true;
@@ -289,6 +308,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// A peer has disconnected.
/// If the peer has active batches, those are considered failed and re-requested.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn peer_disconnected(
&mut self,
@@ -318,15 +343,13 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// short circuit early.
if self.retry_batch_download(network, id).is_err() {
debug!(
self.log,
"Batch could not be retried";
"batch_id" => id,
"error" => "no synced peers"
batch_id = %id,
error = "no synced peers",
"Batch could not be retried"
);
}
} else {
debug!(self.log, "Batch not found while removing peer";
"peer" => %peer_id, "batch" => id)
debug!(peer = %peer_id, batch = %id, "Batch not found while removing peer");
}
}
}
@@ -339,6 +362,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// An RPC error has occurred.
///
/// If the batch exists it is re-requested.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn inject_error(
&mut self,
@@ -356,7 +385,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
if !batch.is_expecting_block(&request_id) {
return Ok(());
}
debug!(self.log, "Batch failed"; "batch_epoch" => batch_id, "error" => "rpc_error");
debug!(batch_epoch = %batch_id, error = "rpc_error", "Batch failed");
if let Some(active_requests) = self.active_requests.get_mut(peer_id) {
active_requests.remove(&batch_id);
}
@@ -378,6 +407,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// If this returns an error, the backfill sync has failed and will be restarted once new peers
/// join the system.
/// The sync manager should update the global sync state on failure.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn on_block_response(
&mut self,
@@ -391,7 +426,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
let Some(batch) = self.batches.get_mut(&batch_id) else {
if !matches!(self.state(), BackFillState::Failed) {
// A batch might get removed when the chain advances, so this is non fatal.
debug!(self.log, "Received a block for unknown batch"; "epoch" => batch_id);
debug!(epoch = %batch_id, "Received a block for unknown batch");
}
return Ok(ProcessResult::Successful);
};
@@ -416,7 +451,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
Ok(received) => {
let awaiting_batches =
self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH;
debug!(self.log, "Completed batch received"; "epoch" => batch_id, "blocks" => received, "awaiting_batches" => awaiting_batches);
debug!(
epoch = %batch_id,
blocks = received,
%awaiting_batches,
"Completed batch received"
);
// pre-emptively request more blocks from peers whilst we process current blocks,
self.request_batches(network)?;
@@ -432,6 +472,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// The syncing process has failed.
///
/// This resets past variables, to allow for a fresh start when resuming.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn fail_sync(&mut self, error: BackFillError) -> Result<(), BackFillError> {
// Some errors shouldn't fail the chain.
if matches!(error, BackFillError::Paused) {
@@ -455,7 +501,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// NOTE: Lets keep validated_batches for posterity
// Emit the log here
error!(self.log, "Backfill sync failed"; "error" => ?error);
error!(?error, "Backfill sync failed");
// Return the error, kinda weird pattern, but I want to use
// `self.fail_chain(_)?` in other parts of the code.
@@ -464,6 +510,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// Processes the batch with the given id.
/// The batch must exist and be ready for processing
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn process_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -503,8 +555,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
.beacon_processor()
.send_chain_segment(process_id, blocks)
{
crit!(self.log, "Failed to send backfill segment to processor."; "msg" => "process_batch",
"error" => %e, "batch" => self.processing_target);
crit!(
msg = "process_batch",
error = %e,
batch = ?self.processing_target,
"Failed to send backfill segment to processor."
);
// This is unlikely to happen but it would stall syncing since the batch now has no
// blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is
@@ -518,6 +574,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// The block processor has completed processing a batch. This function handles the result
/// of the batch processor.
/// If an error is returned the BackFill sync has failed.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn on_batch_process_result(
&mut self,
@@ -530,13 +592,15 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// result
let batch = match &self.current_processing_batch {
Some(processing_id) if *processing_id != batch_id => {
debug!(self.log, "Unexpected batch result";
"batch_epoch" => batch_id, "expected_batch_epoch" => processing_id);
debug!(
batch_epoch = %batch_id.as_u64(),
expected_batch_epoch = processing_id.as_u64(),
"Unexpected batch result"
);
return Ok(ProcessResult::Successful);
}
None => {
debug!(self.log, "Chain was not expecting a batch result";
"batch_epoch" => batch_id);
debug!(%batch_id, "Chain was not expecting a batch result");
return Ok(ProcessResult::Successful);
}
_ => {
@@ -566,8 +630,14 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
return Ok(ProcessResult::Successful);
};
debug!(self.log, "Backfill batch processed"; "result" => ?result, &batch,
"batch_epoch" => batch_id, "peer" => %peer, "client" => %network.client_type(peer));
debug!(
?result,
%batch,
batch_epoch = %batch_id,
%peer,
client = %network.client_type(peer),
"Backfill batch processed"
);
match result {
BatchProcessResult::Success {
@@ -591,7 +661,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// check if the chain has completed syncing
if self.check_completed() {
// chain is completed
info!(self.log, "Backfill sync completed"; "blocks_processed" => self.validated_batches * T::EthSpec::slots_per_epoch());
info!(
blocks_processed = self.validated_batches * T::EthSpec::slots_per_epoch(),
"Backfill sync completed"
);
self.set_state(BackFillState::Completed);
Ok(ProcessResult::SyncCompleted)
} else {
@@ -619,10 +692,9 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// repeatedly and are either malicious or faulty. We stop the backfill sync and
// report all synced peers that have participated.
warn!(
self.log,
"Backfill batch failed to download. Penalizing peers";
"score_adjustment" => %penalty,
"batch_epoch"=> batch_id
score_adjustment = %penalty,
batch_epoch = %batch_id,
"Backfill batch failed to download. Penalizing peers"
);
for peer in self.participating_peers.drain() {
@@ -658,6 +730,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Processes the next ready batch.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn process_completed_batches(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -692,7 +770,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
BatchState::AwaitingValidation(_) => {
// TODO: I don't think this state is possible, log a CRIT just in case.
// If this is not observed, add it to the failed state branch above.
crit!(self.log, "Chain encountered a robust batch awaiting validation"; "batch" => self.processing_target);
crit!(
batch = ?self.processing_target,
"Chain encountered a robust batch awaiting validation"
);
self.processing_target -= BACKFILL_EPOCHS_PER_BATCH;
if self.to_be_downloaded >= self.processing_target {
@@ -718,6 +799,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
///
/// If a previous batch has been validated and it had been re-processed, penalize the original
/// peer.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn advance_chain(&mut self, network: &mut SyncNetworkContext<T>, validating_epoch: Epoch) {
// make sure this epoch produces an advancement
if validating_epoch >= self.current_start {
@@ -745,9 +832,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// A different peer sent the correct batch, the previous peer did not
// We negatively score the original peer.
let action = PeerAction::LowToleranceError;
debug!(self.log, "Re-processed batch validated. Scoring original peer";
"batch_epoch" => id, "score_adjustment" => %action,
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
debug!(
batch_epoch = ?id,
score_adjustment = %action,
original_peer = %attempt.peer_id,
new_peer = %processed_attempt.peer_id,
"Re-processed batch validated. Scoring original peer"
);
network.report_peer(
attempt.peer_id,
@@ -758,9 +848,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// The same peer corrected it's previous mistake. There was an error, so we
// negative score the original peer.
let action = PeerAction::MidToleranceError;
debug!(self.log, "Re-processed batch validated by the same peer";
"batch_epoch" => id, "score_adjustment" => %action,
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
debug!(
batch_epoch = ?id,
score_adjustment = %action,
original_peer = %attempt.peer_id,
new_peer = %processed_attempt.peer_id,
"Re-processed batch validated by the same peer"
);
network.report_peer(
attempt.peer_id,
@@ -778,14 +871,11 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
}
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => {
crit!(
self.log,
"batch indicates inconsistent chain state while advancing chain"
)
crit!("batch indicates inconsistent chain state while advancing chain")
}
BatchState::AwaitingProcessing(..) => {}
BatchState::Processing(_) => {
debug!(self.log, "Advancing chain while processing a batch"; "batch" => id, batch);
debug!(batch = %id, %batch, "Advancing chain while processing a batch");
if let Some(processing_id) = self.current_processing_batch {
if id >= processing_id {
self.current_processing_batch = None;
@@ -803,7 +893,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// won't have this batch, so we need to request it.
self.to_be_downloaded -= BACKFILL_EPOCHS_PER_BATCH;
}
debug!(self.log, "Backfill advanced"; "validated_epoch" => validating_epoch, "processing_target" => self.processing_target);
debug!(?validating_epoch, processing_target = ?self.processing_target, "Backfill advanced");
}
/// An invalid batch has been received that could not be processed, but that can be retried.
@@ -811,6 +901,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// These events occur when a peer has successfully responded with blocks, but the blocks we
/// have received are incorrect or invalid. This indicates the peer has not performed as
/// intended and can result in downvoting a peer.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn handle_invalid_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -862,6 +958,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Sends and registers the request of a batch awaiting download.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn retry_batch_download(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -896,13 +998,19 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
self.send_batch(network, batch_id, peer)
} else {
// If we are here the chain has no more synced peers
info!(self.log, "Backfill sync paused"; "reason" => "insufficient_synced_peers");
info!(reason = "insufficient_synced_peers", "Backfill sync paused");
self.set_state(BackFillState::Paused);
Err(BackFillError::Paused)
}
}
/// Requests the batch assigned to the given id from a given peer.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn send_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -922,7 +1030,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
if let Err(e) = batch.start_downloading_from_peer(peer, request_id) {
return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0));
}
debug!(self.log, "Requesting batch"; "epoch" => batch_id, &batch);
debug!(epoch = %batch_id, %batch, "Requesting batch");
// register the batch for this peer
self.active_requests
@@ -933,8 +1041,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
Err(e) => {
// NOTE: under normal conditions this shouldn't happen but we handle it anyway
warn!(self.log, "Could not send batch request";
"batch_id" => batch_id, "error" => ?e, &batch);
warn!(%batch_id, error = ?e, %batch,"Could not send batch request");
// register the failed download and check if the batch can be retried
if let Err(e) = batch.start_downloading_from_peer(peer, 1) {
return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0));
@@ -963,6 +1070,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// When resuming a chain, this function searches for batches that need to be re-downloaded and
/// transitions their state to redownload the batch.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn resume_batches(&mut self, network: &mut SyncNetworkContext<T>) -> Result<(), BackFillError> {
let batch_ids_to_retry = self
.batches
@@ -987,6 +1100,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer
/// pool and left over batches until the batch buffer is reached or all peers are exhausted.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn request_batches(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -1029,6 +1148,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// Creates the next required batch from the chain. If there are no more batches required,
/// `false` is returned.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn include_next_batch(&mut self, network: &mut SyncNetworkContext<T>) -> Option<BatchId> {
// don't request batches beyond genesis;
if self.last_batch_downloaded {
@@ -1090,6 +1215,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
///
/// This errors if the beacon chain indicates that backfill sync has already completed or is
/// not required.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn reset_start_epoch(&mut self) -> Result<(), ResetEpochError> {
let anchor_info = self.beacon_chain.store.get_anchor_info();
if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) {
@@ -1103,6 +1234,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Checks with the beacon chain if backfill sync has completed.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn check_completed(&mut self) -> bool {
if self.would_complete(self.current_start) {
// Check that the beacon chain agrees
@@ -1111,13 +1248,19 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) {
return true;
} else {
error!(self.log, "Backfill out of sync with beacon chain");
error!("Backfill out of sync with beacon chain");
}
}
false
}
/// Checks if backfill would complete by syncing to `start_epoch`.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn would_complete(&self, start_epoch: Epoch) -> bool {
start_epoch
<= self
@@ -1127,10 +1270,22 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Updates the global network state indicating the current state of a backfill sync.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn set_state(&self, state: BackFillState) {
*self.network_globals.backfill_state.write() = state;
}
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn state(&self) -> BackFillState {
self.network_globals.backfill_state.read().clone()
}