Integrate tracing (#6339)

Tracing Integration
- [reference](5bbf1859e9/projects/project-ideas.md#L297)


- [x] Replace `slog` and `log` with `tracing` throughout the codebase
- [x] Implement a custom `crit` log macro
- [x] Make the relevant changes in the formatter
- [x] Replace `sloggers`
- [x] Rewrite the SSE logging components

cc: @macladson @eserilev
This commit is contained in:
ThreeHrSleep
2025-03-13 04:01:05 +05:30
committed by GitHub
parent f23f984f85
commit d60c24ef1c
241 changed files with 9485 additions and 9328 deletions

View File

@@ -3,6 +3,7 @@ use lighthouse_network::rpc::methods::BlocksByRangeRequest;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::PeerId;
use std::collections::HashSet;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::ops::Sub;
use std::time::{Duration, Instant};
@@ -61,6 +62,7 @@ pub trait BatchConfig {
fn batch_attempt_hash<E: EthSpec>(blocks: &[RpcBlock<E>]) -> u64;
}
#[derive(Debug)]
pub struct RangeSyncBatchConfig {}
impl BatchConfig for RangeSyncBatchConfig {
@@ -93,6 +95,7 @@ pub enum BatchProcessingResult {
NonFaultyFailure,
}
#[derive(Debug)]
/// A segment of a chain.
pub struct BatchInfo<E: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
/// Start slot of the batch.
@@ -113,6 +116,17 @@ pub struct BatchInfo<E: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
marker: std::marker::PhantomData<B>,
}
/// One-line, human-readable summary of a batch: its start slot, end slot and
/// current state, each rendered through its own `Display` implementation.
impl<E: EthSpec, B: BatchConfig> fmt::Display for BatchInfo<E, B> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Borrow only the fields that take part in the summary.
        let Self {
            start_slot,
            end_slot,
            state,
            ..
        } = self;
        // NOTE(review): `end_slot` is printed exactly as stored; confirm whether
        // log readers expect an inclusive or exclusive upper bound here.
        f.write_fmt(format_args!(
            "Start Slot: {}, End Slot: {}, State: {}",
            start_slot, end_slot, state
        ))
    }
}
#[derive(Display)]
/// Current state of a batch
pub enum BatchState<E: EthSpec> {
/// The batch has failed either downloading or processing, but can be requested again.
@@ -190,15 +204,6 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
peers
}
/// Return the number of times this batch has failed downloading and failed processing, in this
/// order.
pub fn failed_attempts(&self) -> (usize, usize) {
(
self.failed_download_attempts.len(),
self.failed_processing_attempts.len(),
)
}
/// Verifies if an incoming block belongs to this batch.
pub fn is_expecting_block(&self, request_id: &Id) -> bool {
if let BatchState::Downloading(_, expected_id) = &self.state {
@@ -456,39 +461,6 @@ impl Attempt {
}
}
impl<E: EthSpec, B: BatchConfig> slog::KV for &mut BatchInfo<E, B> {
fn serialize(
&self,
record: &slog::Record,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
slog::KV::serialize(*self, record, serializer)
}
}
impl<E: EthSpec, B: BatchConfig> slog::KV for BatchInfo<E, B> {
fn serialize(
&self,
record: &slog::Record,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
use slog::Value;
Value::serialize(&self.start_slot, record, "start_slot", serializer)?;
Value::serialize(
&(self.end_slot - 1), // NOTE: The -1 shows inclusive blocks
record,
"end_slot",
serializer,
)?;
serializer.emit_usize("downloaded", self.failed_download_attempts.len())?;
serializer.emit_usize("processed", self.failed_processing_attempts.len())?;
serializer.emit_u8("processed_no_penalty", self.non_faulty_processing_attempts)?;
serializer.emit_arguments("state", &format_args!("{:?}", self.state))?;
serializer.emit_arguments("batch_ty", &format_args!("{}", self.batch_type))?;
slog::Result::Ok(())
}
}
impl<E: EthSpec> std::fmt::Debug for BatchState<E> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {

View File

@@ -9,11 +9,13 @@ use beacon_chain::BeaconChainTypes;
use fnv::FnvHashMap;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::{PeerAction, PeerId};
use logging::crit;
use rand::seq::SliceRandom;
use rand::Rng;
use slog::{crit, debug, o, warn};
use std::collections::{btree_map::Entry, BTreeMap, HashSet};
use std::fmt;
use strum::IntoStaticStr;
use tracing::{debug, instrument, warn};
use types::{Epoch, EthSpec, Hash256, Slot};
/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
@@ -37,6 +39,7 @@ pub type ProcessingResult = Result<KeepChain, RemoveChain>;
/// Reasons for removing a chain
#[derive(Debug)]
#[allow(dead_code)]
pub enum RemoveChain {
EmptyPeerPool,
ChainCompleted,
@@ -66,6 +69,7 @@ pub enum SyncingChainType {
/// A chain of blocks that need to be downloaded. Peers who claim to contain the target head
/// root are grouped into the peer pool and queried for batches when downloading the
/// chain.
#[derive(Debug)]
pub struct SyncingChain<T: BeaconChainTypes> {
/// A random id used to identify this chain.
id: ChainId,
@@ -110,9 +114,16 @@ pub struct SyncingChain<T: BeaconChainTypes> {
/// The current processing batch, if any.
current_processing_batch: Option<BatchId>,
}
/// The chain's log.
log: slog::Logger,
/// Displays a syncing chain as the name of its chain type
/// (`Head`, `Finalized` or `Backfill`).
impl<T: BeaconChainTypes> fmt::Display for SyncingChain<T> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the static label first, then emit it in a single write.
        let label = match self.chain_type {
            SyncingChainType::Head => "Head",
            SyncingChainType::Finalized => "Finalized",
            SyncingChainType::Backfill => "Backfill",
        };
        f.write_str(label)
    }
}
#[derive(PartialEq, Debug)]
@@ -132,7 +143,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
target_head_root: Hash256,
peer_id: PeerId,
chain_type: SyncingChainType,
log: &slog::Logger,
) -> Self {
let mut peers = FnvHashMap::default();
peers.insert(peer_id, Default::default());
@@ -151,7 +161,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
attempted_optimistic_starts: HashSet::default(),
state: ChainSyncingState::Stopped,
current_processing_batch: None,
log: log.new(o!("chain" => id)),
}
}
@@ -161,21 +170,25 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Returns the number of peers currently registered in this chain's peer pool
/// (the length of `self.peers`). A result of `0` means there is no peer left
/// from which batches could be requested.
// The span is created as a root span (`parent = None`) at info level, tagged with
// this chain's id and the `range_sync` service name; no arguments are recorded.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn available_peers(&self) -> usize {
self.peers.len()
}
/// Returns the id that identifies this chain (a copy of `self.id`).
// The span is created as a root span (`parent = None`) at info level, tagged with
// this chain's id and the `range_sync` service name; no arguments are recorded.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn get_id(&self) -> ChainId {
self.id
}
/// Peers currently syncing this chain.
///
/// Returns an iterator that yields a clone of every `PeerId` key in
/// `self.peers`; the iterator borrows `self` for as long as it is alive.
// The span is created as a root span (`parent = None`) at info level, tagged with
// this chain's id and the `range_sync` service name; no arguments are recorded.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn peers(&self) -> impl Iterator<Item = PeerId> + '_ {
self.peers.keys().cloned()
}
/// Progress in epochs made by the chain
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn processed_epochs(&self) -> u64 {
self.processing_target
.saturating_sub(self.start_epoch)
@@ -183,6 +196,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Returns the total count of pending blocks in all the batches of this chain
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn pending_blocks(&self) -> usize {
self.batches
.values()
@@ -192,6 +206,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Removes a peer from the chain.
/// If the peer has active batches, those are considered failed and re-requested.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn remove_peer(
&mut self,
peer_id: &PeerId,
@@ -211,8 +226,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
self.retry_batch_download(network, id)?;
} else {
debug!(self.log, "Batch not found while removing peer";
"peer" => %peer_id, "batch" => id)
debug!(%peer_id, batch = ?id, "Batch not found while removing peer")
}
}
}
@@ -225,6 +239,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Returns the latest slot number that has been processed.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn current_processed_slot(&self) -> Slot {
// the last slot we processed was included in the previous batch, and corresponds to the
// first slot of the current target epoch
@@ -234,6 +249,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// A block has been received for a batch on this chain.
/// If the block correctly completes the batch it will be processed if possible.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn on_block_response(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -245,7 +261,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// check if we have this batch
let batch = match self.batches.get_mut(&batch_id) {
None => {
debug!(self.log, "Received a block for unknown batch"; "epoch" => batch_id);
debug!(epoch = %batch_id, "Received a block for unknown batch");
// A batch might get removed when the chain advances, so this is non fatal.
return Ok(KeepChain);
}
@@ -273,7 +289,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
let awaiting_batches = batch_id
.saturating_sub(self.optimistic_start.unwrap_or(self.processing_target))
/ EPOCHS_PER_BATCH;
debug!(self.log, "Batch downloaded"; "epoch" => batch_id, "blocks" => received, "batch_state" => self.visualize_batch_state(), "awaiting_batches" => awaiting_batches);
debug!(epoch = %batch_id, blocks = received, batch_state = self.visualize_batch_state(), %awaiting_batches,"Batch downloaded");
// pre-emptively request more blocks from peers whilst we process current blocks,
self.request_batches(network)?;
@@ -282,6 +298,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Processes the batch with the given id.
/// The batch must exist and be ready for processing
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn process_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -317,8 +334,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.current_processing_batch = Some(batch_id);
if let Err(e) = beacon_processor.send_chain_segment(process_id, blocks) {
crit!(self.log, "Failed to send chain segment to processor."; "msg" => "process_batch",
"error" => %e, "batch" => self.processing_target);
crit!(msg = "process_batch",error = %e, batch = ?self.processing_target, "Failed to send chain segment to processor.");
// This is unlikely to happen but it would stall syncing since the batch now has no
// blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is
@@ -330,6 +346,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Processes the next ready batch, prioritizing optimistic batches over the processing target.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn process_completed_batches(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -349,7 +366,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
match state {
BatchState::AwaitingProcessing(..) => {
// this batch is ready
debug!(self.log, "Processing optimistic start"; "epoch" => epoch);
debug!(%epoch, "Processing optimistic start");
return self.process_batch(network, epoch);
}
BatchState::Downloading(..) => {
@@ -377,7 +394,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// batch has been requested and processed we can land here. We drop the
// optimistic candidate since we can't conclude whether the batch included
// blocks or not at this point
debug!(self.log, "Dropping optimistic candidate"; "batch" => epoch);
debug!(batch = %epoch, "Dropping optimistic candidate");
self.optimistic_start = None;
}
}
@@ -411,7 +428,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// inside the download buffer (between `self.processing_target` and
// `self.to_be_downloaded`). In this case, eventually the chain advances to the
// batch (`self.processing_target` reaches this point).
debug!(self.log, "Chain encountered a robust batch awaiting validation"; "batch" => self.processing_target);
debug!(
batch = %self.processing_target,
"Chain encountered a robust batch awaiting validation"
);
self.processing_target += EPOCHS_PER_BATCH;
if self.to_be_downloaded <= self.processing_target {
@@ -436,6 +456,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// The block processor has completed processing a batch. This function handles the result
/// of the batch processor.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn on_batch_process_result(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -447,13 +468,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
let batch_state = self.visualize_batch_state();
let batch = match &self.current_processing_batch {
Some(processing_id) if *processing_id != batch_id => {
debug!(self.log, "Unexpected batch result";
"batch_epoch" => batch_id, "expected_batch_epoch" => processing_id);
debug!(batch_epoch = %batch_id, expected_batch_epoch = %processing_id,"Unexpected batch result");
return Ok(KeepChain);
}
None => {
debug!(self.log, "Chain was not expecting a batch result";
"batch_epoch" => batch_id);
debug!(batch_epoch = %batch_id,"Chain was not expecting a batch result");
return Ok(KeepChain);
}
_ => {
@@ -476,8 +495,14 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
})?;
// Log the process result and the batch for debugging purposes.
debug!(self.log, "Batch processing result"; "result" => ?result, &batch,
"batch_epoch" => batch_id, "client" => %network.client_type(&peer), "batch_state" => batch_state);
debug!(
result = ?result,
batch_epoch = %batch_id,
client = %network.client_type(&peer),
batch_state = ?batch_state,
?batch,
"Batch processing result"
);
// We consider three cases. Batch was successfully processed, Batch failed processing due
// to a faulty peer, or batch failed processing but the peer can't be deemed faulty.
@@ -563,10 +588,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// There are some edge cases with forks that could land us in this situation.
// This should be unlikely, so we tolerate these errors, but not often.
warn!(
self.log,
"Batch failed to download. Dropping chain scoring peers";
"score_adjustment" => %penalty,
"batch_epoch"=> batch_id,
score_adjustment = %penalty,
batch_epoch = %batch_id,
"Batch failed to download. Dropping chain scoring peers"
);
for (peer, _) in self.peers.drain() {
@@ -587,6 +611,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
}
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn reject_optimistic_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -599,13 +624,13 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// it. NOTE: this is done to prevent non-sequential batches coming from optimistic
// starts from filling up the buffer size
if epoch < self.to_be_downloaded {
debug!(self.log, "Rejected optimistic batch left for future use"; "epoch" => %epoch, "reason" => reason);
debug!(%epoch, reason, "Rejected optimistic batch left for future use");
// this batch is now treated as any other batch, and re-requested for future use
if redownload {
return self.retry_batch_download(network, epoch);
}
} else {
debug!(self.log, "Rejected optimistic batch"; "epoch" => %epoch, "reason" => reason);
debug!(%epoch, reason, "Rejected optimistic batch");
self.batches.remove(&epoch);
}
}
@@ -621,6 +646,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// If a previous batch has been validated and it had been re-processed, penalize the original
/// peer.
#[allow(clippy::modulo_one)]
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn advance_chain(&mut self, network: &mut SyncNetworkContext<T>, validating_epoch: Epoch) {
// make sure this epoch produces an advancement
if validating_epoch <= self.start_epoch {
@@ -629,7 +655,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// safety check for batch boundaries
if validating_epoch % EPOCHS_PER_BATCH != self.start_epoch % EPOCHS_PER_BATCH {
crit!(self.log, "Validating Epoch is not aligned");
crit!("Validating Epoch is not aligned");
return;
}
@@ -651,9 +677,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// A different peer sent the correct batch, the previous peer did not
// We negatively score the original peer.
let action = PeerAction::LowToleranceError;
debug!(self.log, "Re-processed batch validated. Scoring original peer";
"batch_epoch" => id, "score_adjustment" => %action,
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
debug!(
batch_epoch = %id, score_adjustment = %action,
original_peer = %attempt.peer_id, new_peer = %processed_attempt.peer_id,
"Re-processed batch validated. Scoring original peer"
);
network.report_peer(
attempt.peer_id,
@@ -664,9 +691,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// The same peer corrected it's previous mistake. There was an error, so we
// negative score the original peer.
let action = PeerAction::MidToleranceError;
debug!(self.log, "Re-processed batch validated by the same peer";
"batch_epoch" => id, "score_adjustment" => %action,
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
debug!(
batch_epoch = %id,
score_adjustment = %action,
original_peer = %attempt.peer_id,
new_peer = %processed_attempt.peer_id,
"Re-processed batch validated by the same peer"
);
network.report_peer(
attempt.peer_id,
@@ -683,13 +713,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
active_batches.remove(&id);
}
}
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => crit!(
self.log,
"batch indicates inconsistent chain state while advancing chain"
),
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => {
crit!("batch indicates inconsistent chain state while advancing chain")
}
BatchState::AwaitingProcessing(..) => {}
BatchState::Processing(_) => {
debug!(self.log, "Advancing chain while processing a batch"; "batch" => id, batch);
debug!(batch = %id, %batch, "Advancing chain while processing a batch");
if let Some(processing_id) = self.current_processing_batch {
if id <= processing_id {
self.current_processing_batch = None;
@@ -713,8 +742,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.optimistic_start = None;
}
}
debug!(self.log, "Chain advanced"; "previous_start" => old_start,
"new_start" => self.start_epoch, "processing_target" => self.processing_target);
debug!(
previous_start = %old_start,
new_start = %self.start_epoch,
processing_target = %self.processing_target,
"Chain advanced"
);
}
/// An invalid batch has been received that could not be processed, but that can be retried.
@@ -722,6 +755,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// These events occur when a peer has successfully responded with blocks, but the blocks we
/// have received are incorrect or invalid. This indicates the peer has not performed as
/// intended and can result in downvoting a peer.
#[instrument(parent = None,level = "info", fields(service = self.id, network), skip_all)]
fn handle_invalid_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -781,6 +815,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// This chain has been requested to start syncing.
///
/// This could be new chain, or an old chain that is being resumed.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn start_syncing(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -819,6 +854,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Add a peer to the chain.
///
/// If the chain is active, this starts requesting batches from this peer.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn add_peer(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -836,6 +872,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// An RPC error has occurred.
///
/// If the batch exists it is re-requested.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn inject_error(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -852,24 +889,21 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// columns.
if !batch.is_expecting_block(&request_id) {
debug!(
self.log,
"Batch not expecting block";
"batch_epoch" => batch_id,
"batch_state" => ?batch.state(),
"peer_id" => %peer_id,
"request_id" => %request_id,
"batch_state" => batch_state
batch_epoch = %batch_id,
batch_state = ?batch.state(),
%peer_id,
%request_id,
?batch_state,
"Batch not expecting block"
);
return Ok(KeepChain);
}
debug!(
self.log,
"Batch failed. RPC Error";
"batch_epoch" => batch_id,
"batch_state" => ?batch.state(),
"peer_id" => %peer_id,
"request_id" => %request_id,
"batch_state" => batch_state
batch_epoch = %batch_id,
batch_state = ?batch.state(),
%peer_id,
%request_id,
"Batch failed. RPC Error"
);
if let Some(active_requests) = self.peers.get_mut(peer_id) {
active_requests.remove(&batch_id);
@@ -883,12 +917,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.retry_batch_download(network, batch_id)
} else {
debug!(
self.log,
"Batch not found";
"batch_epoch" => batch_id,
"peer_id" => %peer_id,
"request_id" => %request_id,
"batch_state" => batch_state
batch_epoch = %batch_id,
%peer_id,
%request_id,
batch_state,
"Batch not found"
);
// this could be an error for an old batch, removed when the chain advances
Ok(KeepChain)
@@ -896,6 +929,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Sends and registers the request of a batch awaiting download.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn retry_batch_download(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -932,6 +966,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Requests the batch assigned to the given id from a given peer.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn send_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -958,9 +993,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
.map(|epoch| epoch == batch_id)
.unwrap_or(false)
{
debug!(self.log, "Requesting optimistic batch"; "epoch" => batch_id, &batch, "batch_state" => batch_state);
debug!(epoch = %batch_id, %batch, %batch_state, "Requesting optimistic batch");
} else {
debug!(self.log, "Requesting batch"; "epoch" => batch_id, &batch, "batch_state" => batch_state);
debug!(epoch = %batch_id, %batch, %batch_state, "Requesting batch");
}
// register the batch for this peer
return self
@@ -979,8 +1014,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
Err(e) => {
// NOTE: under normal conditions this shouldn't happen but we handle it anyway
warn!(self.log, "Could not send batch request";
"batch_id" => batch_id, "error" => ?e, &batch);
warn!(%batch_id, error = %e, %batch, "Could not send batch request");
// register the failed download and check if the batch can be retried
batch.start_downloading_from_peer(peer, 1)?; // fake request_id is not relevant
self.peers
@@ -1005,6 +1039,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Returns true if this chain is currently syncing.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn is_syncing(&self) -> bool {
match self.state {
ChainSyncingState::Syncing => true,
@@ -1014,6 +1049,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Kickstarts the chain by sending for processing batches that are ready and requesting more
/// batches if needed.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn resume(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -1026,6 +1062,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer
/// pool and left over batches until the batch buffer is reached or all peers are exhausted.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn request_batches(&mut self, network: &mut SyncNetworkContext<T>) -> ProcessingResult {
if !matches!(self.state, ChainSyncingState::Syncing) {
return Ok(KeepChain);
@@ -1052,10 +1089,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// We wait for this batch before requesting any other batches.
if let Some(epoch) = self.optimistic_start {
if !self.good_peers_on_sampling_subnets(epoch, network) {
debug!(
self.log,
"Waiting for peers to be available on sampling column subnets"
);
debug!("Waiting for peers to be available on sampling column subnets");
return Ok(KeepChain);
}
@@ -1114,6 +1148,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Creates the next required batch from the chain. If there are no more batches required,
/// `false` is returned.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn include_next_batch(&mut self, network: &mut SyncNetworkContext<T>) -> Option<BatchId> {
// don't request batches beyond the target head slot
if self
@@ -1147,10 +1182,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// block and data column requests are currently coupled. This can be removed once we find a
// way to decouple the requests and do retries individually, see issue #6258.
if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) {
debug!(
self.log,
"Waiting for peers to be available on custody column subnets"
);
debug!("Waiting for peers to be available on custody column subnets");
return None;
}
@@ -1177,6 +1209,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// This produces a string of the form: [D,E,E,E,E]
/// to indicate the current buffer state of the chain. The symbols are defined on each of the
/// batch states. See [BatchState::visualize] for symbol definitions.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn visualize_batch_state(&self) -> String {
let mut visualization_string = String::with_capacity((BATCH_BUFFER_SIZE * 3) as usize);
@@ -1212,45 +1245,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
}
impl<T: BeaconChainTypes> slog::KV for &mut SyncingChain<T> {
fn serialize(
&self,
record: &slog::Record,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
slog::KV::serialize(*self, record, serializer)
}
}
impl<T: BeaconChainTypes> slog::KV for SyncingChain<T> {
fn serialize(
&self,
record: &slog::Record,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
use slog::Value;
serializer.emit_u32("id", self.id)?;
Value::serialize(&self.start_epoch, record, "from", serializer)?;
Value::serialize(
&self.target_head_slot.epoch(T::EthSpec::slots_per_epoch()),
record,
"to",
serializer,
)?;
serializer.emit_arguments("end_root", &format_args!("{}", self.target_head_root))?;
Value::serialize(
&self.processing_target,
record,
"current_target",
serializer,
)?;
serializer.emit_usize("batches", self.batches.len())?;
serializer.emit_usize("peers", self.peers.len())?;
serializer.emit_arguments("state", &format_args!("{:?}", self.state))?;
slog::Result::Ok(())
}
}
use super::batch::WrongState as WrongBatchState;
impl From<WrongBatchState> for RemoveChain {
fn from(err: WrongBatchState) -> Self {

View File

@@ -12,11 +12,12 @@ use fnv::FnvHashMap;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::PeerId;
use lighthouse_network::SyncInfo;
use slog::{crit, debug, error};
use logging::crit;
use smallvec::SmallVec;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;
use tracing::{debug, error};
use types::EthSpec;
use types::{Epoch, Hash256, Slot};
@@ -50,18 +51,15 @@ pub struct ChainCollection<T: BeaconChainTypes> {
head_chains: FnvHashMap<ChainId, SyncingChain<T>>,
/// The current sync state of the process.
state: RangeSyncState,
/// Logger for the collection.
log: slog::Logger,
}
impl<T: BeaconChainTypes> ChainCollection<T> {
pub fn new(beacon_chain: Arc<BeaconChain<T>>, log: slog::Logger) -> Self {
pub fn new(beacon_chain: Arc<BeaconChain<T>>) -> Self {
ChainCollection {
beacon_chain,
finalized_chains: FnvHashMap::default(),
head_chains: FnvHashMap::default(),
state: RangeSyncState::Idle,
log,
}
}
@@ -295,9 +293,8 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
.expect("Chain exists");
match old_id {
Some(Some(old_id)) => debug!(self.log, "Switching finalized chains";
"old_id" => old_id, &chain),
None => debug!(self.log, "Syncing new finalized chain"; &chain),
Some(Some(old_id)) => debug!(old_id, %chain, "Switching finalized chains"),
None => debug!(%chain, "Syncing new finalized chain"),
Some(None) => {
// this is the same chain. We try to advance it.
}
@@ -309,10 +306,10 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
if let Err(remove_reason) = chain.start_syncing(network, local_epoch, local_head_epoch)
{
if remove_reason.is_critical() {
crit!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
crit!(chain = new_id, reason = ?remove_reason, "Chain removed while switching chains");
} else {
// this happens only if sending a batch over the `network` fails a lot
error!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
error!(chain = new_id, reason = ?remove_reason, "Chain removed while switching chains");
}
self.finalized_chains.remove(&new_id);
self.on_chain_removed(&new_id, true, RangeSyncType::Finalized);
@@ -330,7 +327,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
) {
// Include the awaiting head peers
for (peer_id, peer_sync_info) in awaiting_head_peers.drain() {
debug!(self.log, "including head peer");
debug!("including head peer");
self.add_peer_or_create_chain(
local_epoch,
peer_sync_info.head_root,
@@ -362,16 +359,16 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
if syncing_chains.len() < PARALLEL_HEAD_CHAINS {
// start this chain if it's not already syncing
if !chain.is_syncing() {
debug!(self.log, "New head chain started syncing"; &chain);
debug!(%chain, "New head chain started syncing");
}
if let Err(remove_reason) =
chain.start_syncing(network, local_epoch, local_head_epoch)
{
self.head_chains.remove(&id);
if remove_reason.is_critical() {
crit!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
crit!(chain = id, reason = ?remove_reason, "Chain removed while switching head chains");
} else {
error!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
error!(chain = id, reason = ?remove_reason, "Chain removed while switching head chains");
}
} else {
syncing_chains.push(id);
@@ -407,7 +404,6 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
.start_slot(T::EthSpec::slots_per_epoch());
let beacon_chain = &self.beacon_chain;
let log_ref = &self.log;
let is_outdated = |target_slot: &Slot, target_root: &Hash256| {
target_slot <= &local_finalized_slot
@@ -425,7 +421,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
if is_outdated(&chain.target_head_slot, &chain.target_head_root)
|| chain.available_peers() == 0
{
debug!(log_ref, "Purging out of finalized chain"; &chain);
debug!(%chain, "Purging out of finalized chain");
Some((*id, chain.is_syncing(), RangeSyncType::Finalized))
} else {
None
@@ -436,7 +432,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
if is_outdated(&chain.target_head_slot, &chain.target_head_root)
|| chain.available_peers() == 0
{
debug!(log_ref, "Purging out of date head chain"; &chain);
debug!(%chain, "Purging out of date head chain");
Some((*id, chain.is_syncing(), RangeSyncType::Head))
} else {
None
@@ -477,14 +473,14 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
.find(|(_, chain)| chain.has_same_target(target_head_slot, target_head_root))
{
Some((&id, chain)) => {
debug!(self.log, "Adding peer to known chain"; "peer_id" => %peer, "sync_type" => ?sync_type, "id" => id);
debug!(peer_id = %peer, ?sync_type, id, "Adding peer to known chain");
debug_assert_eq!(chain.target_head_root, target_head_root);
debug_assert_eq!(chain.target_head_slot, target_head_slot);
if let Err(remove_reason) = chain.add_peer(network, peer) {
if remove_reason.is_critical() {
crit!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
crit!(chain = %id, reason = ?remove_reason, "Chain removed after adding peer");
} else {
error!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
error!(chain = %id, reason = ?remove_reason, "Chain removed after adding peer");
}
let is_syncing = chain.is_syncing();
collection.remove(&id);
@@ -501,9 +497,9 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
target_head_root,
peer,
sync_type.into(),
&self.log,
);
debug!(self.log, "New chain added to sync"; "peer_id" => peer_rpr, "sync_type" => ?sync_type, &new_chain);
debug!(peer_id = peer_rpr, ?sync_type, %new_chain, "New chain added to sync");
collection.insert(id, new_chain);
metrics::inc_counter_vec(&metrics::SYNCING_CHAINS_ADDED, &[sync_type.as_str()]);
self.update_metrics();

View File

@@ -51,10 +51,11 @@ use beacon_chain::{BeaconChain, BeaconChainTypes};
use lighthouse_network::rpc::GoodbyeReason;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::{PeerId, SyncInfo};
use logging::crit;
use lru_cache::LRUTimeCache;
use slog::{crit, debug, trace, warn};
use std::collections::HashMap;
use std::sync::Arc;
use tracing::{debug, instrument, trace, warn};
use types::{Epoch, EthSpec, Hash256};
/// For how long we store failed finalized chains to prevent retries.
@@ -74,23 +75,26 @@ pub struct RangeSync<T: BeaconChainTypes> {
chains: ChainCollection<T>,
/// Chains that have failed and are stored to prevent being retried.
failed_chains: LRUTimeCache<Hash256>,
/// The syncing logger.
log: slog::Logger,
}
impl<T: BeaconChainTypes> RangeSync<T>
where
T: BeaconChainTypes,
{
pub fn new(beacon_chain: Arc<BeaconChain<T>>, log: slog::Logger) -> Self {
/// Builds a `RangeSync` around the given beacon chain.
///
/// The chain collection is constructed from `beacon_chain`, the failed-chain cache is
/// created with a duration of `FAILED_CHAINS_EXPIRY_SECONDS` seconds, and the map of peers
/// awaiting head sync starts out empty.
///
/// The call is wrapped in a `range_sync` tracing span at `info` level with no parent
/// (`parent = None`); `skip_all` means no arguments are recorded as span fields.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn new(beacon_chain: Arc<BeaconChain<T>>) -> Self {
RangeSync {
// A clone is kept here because `beacon_chain` itself is moved into the chain collection.
beacon_chain: beacon_chain.clone(),
chains: ChainCollection::new(beacon_chain, log.clone()),
chains: ChainCollection::new(beacon_chain),
failed_chains: LRUTimeCache::new(std::time::Duration::from_secs(
FAILED_CHAINS_EXPIRY_SECONDS,
)),
awaiting_head_peers: HashMap::new(),
log,
}
}
@@ -99,6 +103,12 @@ where
self.failed_chains.keys().copied().collect()
}
/// Returns the current sync status as reported by the underlying chain collection.
///
/// This is a thin delegation to `self.chains.state()`. The call is wrapped in a
/// `range_sync` tracing span at `info` level with no parent (`parent = None`).
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn state(&self) -> SyncChainStatus {
self.chains.state()
}
@@ -108,6 +118,12 @@ where
/// may need to be synced as a result. A new peer may increase the peer pool of a finalized
/// chain; this may result in a different finalized chain being synced, as finalized chains are
/// prioritised by peer-pool size.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn add_peer(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -133,14 +149,13 @@ where
RangeSyncType::Finalized => {
// Make sure we have not recently tried this chain
if self.failed_chains.contains(&remote_info.finalized_root) {
debug!(self.log, "Disconnecting peer that belongs to previously failed chain";
"failed_root" => %remote_info.finalized_root, "peer_id" => %peer_id);
debug!(failed_root = ?remote_info.finalized_root, %peer_id,"Disconnecting peer that belongs to previously failed chain");
network.goodbye_peer(peer_id, GoodbyeReason::IrrelevantNetwork);
return;
}
// Finalized chain search
debug!(self.log, "Finalization sync peer joined"; "peer_id" => %peer_id);
debug!(%peer_id, "Finalization sync peer joined");
self.awaiting_head_peers.remove(&peer_id);
// Because of our change in finalized sync batch size from 2 to 1 and our transition
@@ -171,8 +186,7 @@ where
if self.chains.is_finalizing_sync() {
// If there are finalized chains to sync, finish these first, before syncing head
// chains.
trace!(self.log, "Waiting for finalized sync to complete";
"peer_id" => %peer_id, "awaiting_head_peers" => &self.awaiting_head_peers.len());
trace!(%peer_id, awaiting_head_peers = &self.awaiting_head_peers.len(),"Waiting for finalized sync to complete");
self.awaiting_head_peers.insert(peer_id, remote_info);
return;
}
@@ -204,6 +218,12 @@ where
///
/// This function finds the chain that made this request. Once found, processes the result.
/// This request could complete a chain or simply add to its progress.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn blocks_by_range_response(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -229,11 +249,17 @@ where
}
}
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
trace!(%chain_id, "BlocksByRange response for removed chain")
}
}
}
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn handle_block_process_result(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -259,13 +285,19 @@ where
}
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
trace!(%chain_id, "BlocksByRange response for removed chain")
}
}
}
/// A peer has disconnected. This removes the peer from any ongoing chains and mappings. A
/// disconnected peer could remove a chain.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn peer_disconnect(&mut self, network: &mut SyncNetworkContext<T>, peer_id: &PeerId) {
// if the peer is in the awaiting head mapping, remove it
self.awaiting_head_peers.remove(peer_id);
@@ -278,6 +310,12 @@ where
/// which pool the peer is in. The chain may also have a batch or batches awaiting
/// for this peer. If so we mark the batch as failed. The batch may then hit its maximum
/// retries. In this case, we need to remove the chain.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
fn remove_peer(&mut self, network: &mut SyncNetworkContext<T>, peer_id: &PeerId) {
for (removed_chain, sync_type, remove_reason) in self
.chains
@@ -297,6 +335,12 @@ where
///
/// Check to see if the request corresponds to a pending batch. If so, re-request it if possible; if there have
/// been too many failed attempts for the batch, remove the chain.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn inject_error(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -321,11 +365,17 @@ where
}
}
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
trace!(%chain_id, "BlocksByRange response for removed chain")
}
}
}
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
fn on_chain_removed(
&mut self,
chain: SyncingChain<T>,
@@ -335,14 +385,18 @@ where
op: &'static str,
) {
if remove_reason.is_critical() {
crit!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
crit!(?sync_type, %chain, reason = ?remove_reason,op, "Chain removed");
} else {
debug!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
debug!(?sync_type, %chain, reason = ?remove_reason,op, "Chain removed");
}
if let RemoveChain::ChainFailed { blacklist, .. } = remove_reason {
if RangeSyncType::Finalized == sync_type && blacklist {
warn!(self.log, "Chain failed! Syncing to its head won't be retried for at least the next {} seconds", FAILED_CHAINS_EXPIRY_SECONDS; &chain);
warn!(
%chain,
"Chain failed! Syncing to its head won't be retried for at least the next {} seconds",
FAILED_CHAINS_EXPIRY_SECONDS
);
self.failed_chains.insert(chain.target_head_root);
}
}
@@ -369,6 +423,12 @@ where
}
/// Kickstarts sync.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn resume(&mut self, network: &mut SyncNetworkContext<T>) {
for (removed_chain, sync_type, remove_reason) in
self.chains.call_all(|chain| chain.resume(network))