Integrate tracing (#6339)

Tracing Integration
- [reference](5bbf1859e9/projects/project-ideas.md#L297)


- [x] replace slog & log with tracing throughout the codebase
- [x] implement custom crit log
- [x] make relevant changes in the formatter
- [x] replace sloggers
- [x] re-write SSE logging components

cc: @macladson @eserilev
This commit is contained in:
ThreeHrSleep
2025-03-13 04:01:05 +05:30
committed by GitHub
parent f23f984f85
commit d60c24ef1c
241 changed files with 9485 additions and 9328 deletions

View File

@@ -20,13 +20,14 @@ use beacon_chain::{BeaconChain, BeaconChainTypes};
use lighthouse_network::service::api_types::Id;
use lighthouse_network::types::{BackFillState, NetworkGlobals};
use lighthouse_network::{PeerAction, PeerId};
use logging::crit;
use rand::seq::SliceRandom;
use slog::{crit, debug, error, info, warn};
use std::collections::{
btree_map::{BTreeMap, Entry},
HashMap, HashSet,
};
use std::sync::Arc;
use tracing::{debug, error, info, instrument, warn};
use types::{Epoch, EthSpec};
/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
@@ -146,16 +147,17 @@ pub struct BackFillSync<T: BeaconChainTypes> {
/// Reference to the network globals in order to obtain valid peers to backfill blocks from
/// (i.e synced peers).
network_globals: Arc<NetworkGlobals<T::EthSpec>>,
/// A logger for backfill sync.
log: slog::Logger,
}
impl<T: BeaconChainTypes> BackFillSync<T> {
#[instrument(parent = None,
level = "info",
name = "backfill_sync",
skip_all
)]
pub fn new(
beacon_chain: Arc<BeaconChain<T>>,
network_globals: Arc<NetworkGlobals<T::EthSpec>>,
log: slog::Logger,
) -> Self {
// Determine if backfill is enabled or not.
// If, for some reason a backfill has already been completed (or we've used a trusted
@@ -186,7 +188,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
participating_peers: HashSet::new(),
restart_failed_sync: false,
beacon_chain,
log,
};
// Update the global network state with the current backfill state.
@@ -195,9 +196,15 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Pauses the backfill sync if it's currently syncing.
///
/// Only acts when `self.state()` is `BackFillState::Syncing`: it emits a debug event and then
/// sets the global state to `BackFillState::Paused`. In any other state this does nothing.
///
/// The call runs inside a root (`parent = None`) `backfill_sync` span at info level; `skip_all`
/// keeps the arguments out of the span's fields.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
pub fn pause(&mut self) {
if let BackFillState::Syncing = self.state() {
// NOTE(review): the two `debug!` lines below look like the removed slog call followed by its
// tracing replacement, as rendered by the diff view — presumably only the second one exists
// in the post-change file; confirm against the repository.
debug!(self.log, "Backfill sync paused"; "processed_epochs" => self.validated_batches, "to_be_processed" => self.current_start);
debug!(processed_epochs = %self.validated_batches, to_be_processed = %self.current_start,"Backfill sync paused");
self.set_state(BackFillState::Paused);
}
}
@@ -206,6 +213,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
///
/// If resuming is successful, reports back the current syncing metrics.
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
pub fn start(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -222,7 +235,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
.is_some()
{
// If there are peers to resume with, begin the resume.
debug!(self.log, "Resuming backfill sync"; "start_epoch" => self.current_start, "awaiting_batches" => self.batches.len(), "processing_target" => self.processing_target);
debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync");
self.set_state(BackFillState::Syncing);
// Resume any previously failed batches.
self.resume_batches(network)?;
@@ -251,14 +264,14 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// This infallible match exists to force us to update this code if a future
// refactor of `ResetEpochError` adds a variant.
let ResetEpochError::SyncCompleted = e;
error!(self.log, "Backfill sync completed whilst in failed status");
error!("Backfill sync completed whilst in failed status");
self.set_state(BackFillState::Completed);
return Err(BackFillError::InvalidSyncState(String::from(
"chain completed",
)));
}
debug!(self.log, "Resuming a failed backfill sync"; "start_epoch" => self.current_start);
debug!(start_epoch = %self.current_start, "Resuming a failed backfill sync");
// begin requesting blocks from the peer pool, until all peers are exhausted.
self.request_batches(network)?;
@@ -281,6 +294,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// A fully synced peer has joined us.
/// If we are in a failed state, update a local variable to indicate we are able to restart
/// the failed sync on the next attempt.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
pub fn fully_synced_peer_joined(&mut self) {
if matches!(self.state(), BackFillState::Failed) {
self.restart_failed_sync = true;
@@ -289,6 +308,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// A peer has disconnected.
/// If the peer has active batches, those are considered failed and re-requested.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn peer_disconnected(
&mut self,
@@ -318,15 +343,13 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// short circuit early.
if self.retry_batch_download(network, id).is_err() {
debug!(
self.log,
"Batch could not be retried";
"batch_id" => id,
"error" => "no synced peers"
batch_id = %id,
error = "no synced peers",
"Batch could not be retried"
);
}
} else {
debug!(self.log, "Batch not found while removing peer";
"peer" => %peer_id, "batch" => id)
debug!(peer = %peer_id, batch = %id, "Batch not found while removing peer");
}
}
}
@@ -339,6 +362,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// An RPC error has occurred.
///
/// If the batch exists it is re-requested.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn inject_error(
&mut self,
@@ -356,7 +385,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
if !batch.is_expecting_block(&request_id) {
return Ok(());
}
debug!(self.log, "Batch failed"; "batch_epoch" => batch_id, "error" => "rpc_error");
debug!(batch_epoch = %batch_id, error = "rpc_error", "Batch failed");
if let Some(active_requests) = self.active_requests.get_mut(peer_id) {
active_requests.remove(&batch_id);
}
@@ -378,6 +407,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// If this returns an error, the backfill sync has failed and will be restarted once new peers
/// join the system.
/// The sync manager should update the global sync state on failure.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn on_block_response(
&mut self,
@@ -391,7 +426,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
let Some(batch) = self.batches.get_mut(&batch_id) else {
if !matches!(self.state(), BackFillState::Failed) {
// A batch might get removed when the chain advances, so this is non fatal.
debug!(self.log, "Received a block for unknown batch"; "epoch" => batch_id);
debug!(epoch = %batch_id, "Received a block for unknown batch");
}
return Ok(ProcessResult::Successful);
};
@@ -416,7 +451,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
Ok(received) => {
let awaiting_batches =
self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH;
debug!(self.log, "Completed batch received"; "epoch" => batch_id, "blocks" => received, "awaiting_batches" => awaiting_batches);
debug!(
epoch = %batch_id,
blocks = received,
%awaiting_batches,
"Completed batch received"
);
// pre-emptively request more blocks from peers whilst we process current blocks,
self.request_batches(network)?;
@@ -432,6 +472,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// The syncing process has failed.
///
/// This resets past variables, to allow for a fresh start when resuming.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn fail_sync(&mut self, error: BackFillError) -> Result<(), BackFillError> {
// Some errors shouldn't fail the chain.
if matches!(error, BackFillError::Paused) {
@@ -455,7 +501,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// NOTE: Lets keep validated_batches for posterity
// Emit the log here
error!(self.log, "Backfill sync failed"; "error" => ?error);
error!(?error, "Backfill sync failed");
// Return the error, kinda weird pattern, but I want to use
// `self.fail_sync(_)?` in other parts of the code.
@@ -464,6 +510,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// Processes the batch with the given id.
/// The batch must exist and be ready for processing
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn process_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -503,8 +555,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
.beacon_processor()
.send_chain_segment(process_id, blocks)
{
crit!(self.log, "Failed to send backfill segment to processor."; "msg" => "process_batch",
"error" => %e, "batch" => self.processing_target);
crit!(
msg = "process_batch",
error = %e,
batch = ?self.processing_target,
"Failed to send backfill segment to processor."
);
// This is unlikely to happen but it would stall syncing since the batch now has no
// blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is
@@ -518,6 +574,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// The block processor has completed processing a batch. This function handles the result
/// of the batch processor.
/// If an error is returned the BackFill sync has failed.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn on_batch_process_result(
&mut self,
@@ -530,13 +592,15 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// result
let batch = match &self.current_processing_batch {
Some(processing_id) if *processing_id != batch_id => {
debug!(self.log, "Unexpected batch result";
"batch_epoch" => batch_id, "expected_batch_epoch" => processing_id);
debug!(
batch_epoch = %batch_id.as_u64(),
expected_batch_epoch = processing_id.as_u64(),
"Unexpected batch result"
);
return Ok(ProcessResult::Successful);
}
None => {
debug!(self.log, "Chain was not expecting a batch result";
"batch_epoch" => batch_id);
debug!(%batch_id, "Chain was not expecting a batch result");
return Ok(ProcessResult::Successful);
}
_ => {
@@ -566,8 +630,14 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
return Ok(ProcessResult::Successful);
};
debug!(self.log, "Backfill batch processed"; "result" => ?result, &batch,
"batch_epoch" => batch_id, "peer" => %peer, "client" => %network.client_type(peer));
debug!(
?result,
%batch,
batch_epoch = %batch_id,
%peer,
client = %network.client_type(peer),
"Backfill batch processed"
);
match result {
BatchProcessResult::Success {
@@ -591,7 +661,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// check if the chain has completed syncing
if self.check_completed() {
// chain is completed
info!(self.log, "Backfill sync completed"; "blocks_processed" => self.validated_batches * T::EthSpec::slots_per_epoch());
info!(
blocks_processed = self.validated_batches * T::EthSpec::slots_per_epoch(),
"Backfill sync completed"
);
self.set_state(BackFillState::Completed);
Ok(ProcessResult::SyncCompleted)
} else {
@@ -619,10 +692,9 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// repeatedly and are either malicious or faulty. We stop the backfill sync and
// report all synced peers that have participated.
warn!(
self.log,
"Backfill batch failed to download. Penalizing peers";
"score_adjustment" => %penalty,
"batch_epoch"=> batch_id
score_adjustment = %penalty,
batch_epoch = %batch_id,
"Backfill batch failed to download. Penalizing peers"
);
for peer in self.participating_peers.drain() {
@@ -658,6 +730,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Processes the next ready batch.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn process_completed_batches(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -692,7 +770,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
BatchState::AwaitingValidation(_) => {
// TODO: I don't think this state is possible, log a CRIT just in case.
// If this is not observed, add it to the failed state branch above.
crit!(self.log, "Chain encountered a robust batch awaiting validation"; "batch" => self.processing_target);
crit!(
batch = ?self.processing_target,
"Chain encountered a robust batch awaiting validation"
);
self.processing_target -= BACKFILL_EPOCHS_PER_BATCH;
if self.to_be_downloaded >= self.processing_target {
@@ -718,6 +799,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
///
/// If a previous batch has been validated and it had been re-processed, penalize the original
/// peer.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn advance_chain(&mut self, network: &mut SyncNetworkContext<T>, validating_epoch: Epoch) {
// make sure this epoch produces an advancement
if validating_epoch >= self.current_start {
@@ -745,9 +832,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// A different peer sent the correct batch, the previous peer did not
// We negatively score the original peer.
let action = PeerAction::LowToleranceError;
debug!(self.log, "Re-processed batch validated. Scoring original peer";
"batch_epoch" => id, "score_adjustment" => %action,
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
debug!(
batch_epoch = ?id,
score_adjustment = %action,
original_peer = %attempt.peer_id,
new_peer = %processed_attempt.peer_id,
"Re-processed batch validated. Scoring original peer"
);
network.report_peer(
attempt.peer_id,
@@ -758,9 +848,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// The same peer corrected its previous mistake. There was an error, so we
// negatively score the original peer.
let action = PeerAction::MidToleranceError;
debug!(self.log, "Re-processed batch validated by the same peer";
"batch_epoch" => id, "score_adjustment" => %action,
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
debug!(
batch_epoch = ?id,
score_adjustment = %action,
original_peer = %attempt.peer_id,
new_peer = %processed_attempt.peer_id,
"Re-processed batch validated by the same peer"
);
network.report_peer(
attempt.peer_id,
@@ -778,14 +871,11 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
}
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => {
crit!(
self.log,
"batch indicates inconsistent chain state while advancing chain"
)
crit!("batch indicates inconsistent chain state while advancing chain")
}
BatchState::AwaitingProcessing(..) => {}
BatchState::Processing(_) => {
debug!(self.log, "Advancing chain while processing a batch"; "batch" => id, batch);
debug!(batch = %id, %batch, "Advancing chain while processing a batch");
if let Some(processing_id) = self.current_processing_batch {
if id >= processing_id {
self.current_processing_batch = None;
@@ -803,7 +893,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// won't have this batch, so we need to request it.
self.to_be_downloaded -= BACKFILL_EPOCHS_PER_BATCH;
}
debug!(self.log, "Backfill advanced"; "validated_epoch" => validating_epoch, "processing_target" => self.processing_target);
debug!(?validating_epoch, processing_target = ?self.processing_target, "Backfill advanced");
}
/// An invalid batch has been received that could not be processed, but that can be retried.
@@ -811,6 +901,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// These events occur when a peer has successfully responded with blocks, but the blocks we
/// have received are incorrect or invalid. This indicates the peer has not performed as
/// intended and can result in downvoting a peer.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn handle_invalid_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -862,6 +958,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Sends and registers the request of a batch awaiting download.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn retry_batch_download(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -896,13 +998,19 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
self.send_batch(network, batch_id, peer)
} else {
// If we are here the chain has no more synced peers
info!(self.log, "Backfill sync paused"; "reason" => "insufficient_synced_peers");
info!(reason = "insufficient_synced_peers", "Backfill sync paused");
self.set_state(BackFillState::Paused);
Err(BackFillError::Paused)
}
}
/// Requests the batch assigned to the given id from a given peer.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn send_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -922,7 +1030,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
if let Err(e) = batch.start_downloading_from_peer(peer, request_id) {
return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0));
}
debug!(self.log, "Requesting batch"; "epoch" => batch_id, &batch);
debug!(epoch = %batch_id, %batch, "Requesting batch");
// register the batch for this peer
self.active_requests
@@ -933,8 +1041,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
Err(e) => {
// NOTE: under normal conditions this shouldn't happen but we handle it anyway
warn!(self.log, "Could not send batch request";
"batch_id" => batch_id, "error" => ?e, &batch);
warn!(%batch_id, error = ?e, %batch,"Could not send batch request");
// register the failed download and check if the batch can be retried
if let Err(e) = batch.start_downloading_from_peer(peer, 1) {
return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0));
@@ -963,6 +1070,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// When resuming a chain, this function searches for batches that need to be re-downloaded and
/// transitions their state to redownload the batch.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn resume_batches(&mut self, network: &mut SyncNetworkContext<T>) -> Result<(), BackFillError> {
let batch_ids_to_retry = self
.batches
@@ -987,6 +1100,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer
/// pool and left over batches until the batch buffer is reached or all peers are exhausted.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn request_batches(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -1029,6 +1148,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// Creates the next required batch from the chain. If there are no more batches required,
/// `false` is returned.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn include_next_batch(&mut self, network: &mut SyncNetworkContext<T>) -> Option<BatchId> {
// don't request batches beyond genesis;
if self.last_batch_downloaded {
@@ -1090,6 +1215,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
///
/// This errors if the beacon chain indicates that backfill sync has already completed or is
/// not required.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn reset_start_epoch(&mut self) -> Result<(), ResetEpochError> {
let anchor_info = self.beacon_chain.store.get_anchor_info();
if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) {
@@ -1103,6 +1234,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Checks with the beacon chain if backfill sync has completed.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn check_completed(&mut self) -> bool {
if self.would_complete(self.current_start) {
// Check that the beacon chain agrees
@@ -1111,13 +1248,19 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) {
return true;
} else {
error!(self.log, "Backfill out of sync with beacon chain");
error!("Backfill out of sync with beacon chain");
}
}
false
}
/// Checks if backfill would complete by syncing to `start_epoch`.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn would_complete(&self, start_epoch: Epoch) -> bool {
start_epoch
<= self
@@ -1127,10 +1270,22 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
}
/// Updates the global network state indicating the current state of a backfill sync.
///
/// Overwrites `network_globals.backfill_state` with `state`. Takes `&self`: the mutation goes
/// through the guard returned by `write()` (presumably a read-write lock — confirm against
/// `NetworkGlobals`).
///
/// The call runs inside a root (`parent = None`) `backfill_sync` span at info level; `skip_all`
/// keeps `state` out of the span's fields.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn set_state(&self, state: BackFillState) {
// The value returned by `write()` is a temporary, so it is dropped at the end of this statement.
*self.network_globals.backfill_state.write() = state;
}
/// Returns a clone of the backfill state currently stored in `network_globals.backfill_state`.
///
/// The value is read through `read()` and cloned, so the caller gets an owned `BackFillState`
/// rather than a reference into the shared state.
///
/// The call runs inside a root (`parent = None`) `backfill_sync` span at info level.
#[instrument(parent = None,
level = "info",
fields(service = "backfill_sync"),
name = "backfill_sync",
skip_all
)]
fn state(&self) -> BackFillState {
self.network_globals.backfill_state.read().clone()
}

View File

@@ -41,11 +41,11 @@ use lighthouse_network::service::api_types::SingleLookupReqId;
use lighthouse_network::{PeerAction, PeerId};
use lru_cache::LRUTimeCache;
pub use single_block_lookup::{BlobRequestState, BlockRequestState, CustodyRequestState};
use slog::{debug, error, warn, Logger};
use std::collections::hash_map::Entry;
use std::sync::Arc;
use std::time::Duration;
use store::Hash256;
use tracing::{debug, error, instrument, warn};
use types::{BlobSidecar, DataColumnSidecar, EthSpec, SignedBeaconBlock};
pub mod common;
@@ -116,9 +116,6 @@ pub struct BlockLookups<T: BeaconChainTypes> {
// TODO: Why not index lookups by block_root?
single_block_lookups: FnvHashMap<SingleLookupId, SingleBlockLookup<T>>,
/// The logger for the import manager.
log: Logger,
}
#[cfg(test)]
@@ -130,27 +127,45 @@ use lighthouse_network::service::api_types::Id;
pub(crate) type BlockLookupSummary = (Id, Hash256, Option<Hash256>, Vec<PeerId>);
impl<T: BeaconChainTypes> BlockLookups<T> {
pub fn new(log: Logger) -> Self {
#[instrument(parent = None,level = "info", fields(service = "lookup_sync"), name = "lookup_sync")]
pub fn new() -> Self {
Self {
failed_chains: LRUTimeCache::new(Duration::from_secs(
FAILED_CHAINS_CACHE_EXPIRY_SECONDS,
)),
single_block_lookups: Default::default(),
log,
}
}
/// Test-only helper: inserts `block_root` into the `failed_chains` cache.
///
/// Compiled only under `#[cfg(test)]`. The call runs inside a root (`parent = None`)
/// `lookup_sync` span at info level; `skip_all` keeps `block_root` out of the span's fields.
#[cfg(test)]
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) {
self.failed_chains.insert(block_root);
}
/// Test-only helper: returns a copy of every block root yielded by `failed_chains.keys()`.
///
/// Compiled only under `#[cfg(test)]`. The roots are cloned into a new `Vec`, so the result is
/// independent of the cache.
///
/// NOTE(review): takes `&mut self` although it only collects keys — presumably
/// `LRUTimeCache::keys` needs mutable access; confirm against that type.
#[cfg(test)]
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub(crate) fn get_failed_chains(&mut self) -> Vec<Hash256> {
self.failed_chains.keys().cloned().collect()
}
#[cfg(test)]
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub(crate) fn active_single_lookups(&self) -> Vec<BlockLookupSummary> {
self.single_block_lookups
.iter()
@@ -159,6 +174,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
/// Returns a vec of all parent lookup chains by tip, in descending slot order (tip first)
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub(crate) fn active_parent_lookups(&self) -> Vec<NodeChain> {
compute_parent_chains(
&self
@@ -173,6 +194,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Creates a parent lookup for the block with the given `block_root` and immediately triggers it.
/// If a parent lookup exists or is triggered, a current lookup will be created.
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn search_child_and_parent(
&mut self,
block_root: Hash256,
@@ -202,6 +229,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Search for a block whose parent root is unknown.
/// Returns true if the lookup is created or already exists
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn search_unknown_block(
&mut self,
block_root: Hash256,
@@ -217,6 +250,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// - `block_root_to_search` is a failed chain
///
/// Returns true if the lookup is created or already exists
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn search_parent_of_child(
&mut self,
block_root_to_search: Hash256,
@@ -238,7 +277,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
if (block_would_extend_chain || trigger_is_chain_tip)
&& parent_chain.len() >= PARENT_DEPTH_TOLERANCE
{
debug!(self.log, "Parent lookup chain too long"; "block_root" => ?block_root_to_search);
debug!(block_root = ?block_root_to_search, "Parent lookup chain too long");
// Searching for this parent would extend a parent chain over the max
// Insert the tip only to failed chains
@@ -283,9 +322,10 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
});
} else {
// Should never happen, log error and continue the lookup drop
error!(self.log, "Unable to transition lookup to range sync";
"error" => "Parent chain tip lookup not found",
"block_root" => ?parent_chain_tip
error!(
error = "Parent chain tip lookup not found",
block_root = ?parent_chain_tip,
"Unable to transition lookup to range sync"
);
}
@@ -299,9 +339,10 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
self.drop_lookup_and_children(*lookup_id);
} else {
// Should never happen
error!(self.log, "Unable to transition lookup to range sync";
"error" => "Block to drop lookup not found",
"block_root" => ?block_to_drop
error!(
error = "Block to drop lookup not found",
block_root = ?block_to_drop,
"Unable to transition lookup to range sync"
);
}
@@ -316,6 +357,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Searches for a single block hash. If the block's parent is unknown, a chain of blocks is
/// constructed.
/// Returns true if the lookup is created or already exists
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn new_current_lookup(
&mut self,
block_root: Hash256,
@@ -326,7 +373,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
) -> bool {
// If this block or its parent is part of a known failed chain, ignore it.
if self.failed_chains.contains(&block_root) {
debug!(self.log, "Block is from a past failed chain. Dropping"; "block_root" => ?block_root);
debug!(?block_root, "Block is from a past failed chain. Dropping");
for peer_id in peers {
cx.report_peer(*peer_id, PeerAction::MidToleranceError, "failed_chain");
}
@@ -343,12 +390,15 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
let component_type = block_component.get_type();
let imported = lookup.add_child_components(block_component);
if !imported {
debug!(self.log, "Lookup child component ignored"; "block_root" => ?block_root, "type" => component_type);
debug!(
?block_root,
component_type, "Lookup child component ignored"
);
}
}
if let Err(e) = self.add_peers_to_lookup_and_ancestors(lookup_id, peers, cx) {
warn!(self.log, "Error adding peers to ancestor lookup"; "error" => ?e);
warn!(error = ?e, "Error adding peers to ancestor lookup");
}
return true;
@@ -361,7 +411,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
.iter()
.any(|(_, lookup)| lookup.is_for_block(awaiting_parent))
{
warn!(self.log, "Ignoring child lookup parent lookup not found"; "block_root" => ?awaiting_parent);
warn!(block_root = ?awaiting_parent, "Ignoring child lookup parent lookup not found");
return false;
}
}
@@ -369,7 +419,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
// Lookups contain untrusted data, bound the total count of lookups held in memory to reduce
// the risk of OOM in case of bugs or malicious activity.
if self.single_block_lookups.len() > MAX_LOOKUPS {
warn!(self.log, "Dropping lookup reached max"; "block_root" => ?block_root);
warn!(?block_root, "Dropping lookup reached max");
return false;
}
@@ -387,18 +437,19 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
Entry::Vacant(entry) => entry.insert(lookup),
Entry::Occupied(_) => {
// Should never happen
warn!(self.log, "Lookup exists with same id"; "id" => id);
warn!(id, "Lookup exists with same id");
return false;
}
};
debug!(
self.log,
"Created block lookup";
"peer_ids" => ?peers,
"block_root" => ?block_root,
"awaiting_parent" => awaiting_parent.map(|root| root.to_string()).unwrap_or("none".to_owned()),
"id" => lookup.id,
?peers,
?block_root,
awaiting_parent = awaiting_parent
.map(|root| root.to_string())
.unwrap_or("none".to_owned()),
id = lookup.id,
"Created block lookup"
);
metrics::inc_counter(&metrics::SYNC_LOOKUP_CREATED);
@@ -414,6 +465,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/* Lookup responses */
/// Process a block or blob response received from a single lookup request.
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn on_download_response<R: RequestState<T>>(
&mut self,
id: SingleLookupReqId,
@@ -437,7 +494,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else {
// We don't have the ability to cancel in-flight RPC requests. So this can happen
// if we started this RPC request, and later saw the block/blobs via gossip.
debug!(self.log, "Block returned for single block lookup not present"; "id" => ?id);
debug!(?id, "Block returned for single block lookup not present");
return Err(LookupRequestError::UnknownLookup);
};
@@ -448,12 +505,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
match response {
Ok((response, peer_group, seen_timestamp)) => {
debug!(self.log,
"Received lookup download success";
"block_root" => ?block_root,
"id" => ?id,
"peer_group" => ?peer_group,
"response_type" => ?response_type,
debug!(
?block_root,
?id,
?peer_group,
?response_type,
"Received lookup download success"
);
// Here we could check if response extends a parent chain beyond its max length.
@@ -481,12 +538,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
Err(e) => {
// No need to log peer source here. When sending a DataColumnsByRoot request we log
// the peer and the request ID which is linked to this `id` value here.
debug!(self.log,
"Received lookup download failure";
"block_root" => ?block_root,
"id" => ?id,
"response_type" => ?response_type,
"error" => ?e,
debug!(
?block_root,
?id,
?response_type,
error = ?e,
"Received lookup download failure"
);
request_state.on_download_failure(id.req_id)?;
@@ -499,6 +556,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/* Error responses */
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn peer_disconnected(&mut self, peer_id: &PeerId) {
for (_, lookup) in self.single_block_lookups.iter_mut() {
lookup.remove_peer(peer_id);
@@ -507,6 +570,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/* Processing responses */
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn on_processing_result(
&mut self,
process_type: BlockProcessType,
@@ -527,6 +596,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
self.on_lookup_result(process_type.id(), lookup_result, "processing_result", cx);
}
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn on_processing_result_inner<R: RequestState<T>>(
&mut self,
lookup_id: SingleLookupId,
@@ -534,7 +609,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
cx: &mut SyncNetworkContext<T>,
) -> Result<LookupResult, LookupRequestError> {
let Some(lookup) = self.single_block_lookups.get_mut(&lookup_id) else {
debug!(self.log, "Unknown single block lookup"; "id" => lookup_id);
debug!(id = lookup_id, "Unknown single block lookup");
return Err(LookupRequestError::UnknownLookup);
};
@@ -544,12 +619,11 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
.get_state_mut();
debug!(
self.log,
"Received lookup processing result";
"component" => ?R::response_type(),
"block_root" => ?block_root,
"id" => lookup_id,
"result" => ?result,
component = ?R::response_type(),
?block_root,
id = lookup_id,
?result,
"Received lookup processing result"
);
let action = match result {
@@ -581,20 +655,15 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
BlockProcessingResult::Err(BlockError::DuplicateImportStatusUnknown(..)) => {
// This is unreachable because RPC blocks do not undergo gossip verification, and
// this error can *only* come from gossip verification.
error!(
self.log,
"Single block lookup hit unreachable condition";
"block_root" => ?block_root
);
error!(?block_root, "Single block lookup hit unreachable condition");
Action::Drop
}
BlockProcessingResult::Ignored => {
// Beacon processor signalled to ignore the block processing result.
// This implies that the cpu is overloaded. Drop the request.
warn!(
self.log,
"Lookup component processing ignored, cpu might be overloaded";
"component" => ?R::response_type(),
component = ?R::response_type(),
"Lookup component processing ignored, cpu might be overloaded"
);
Action::Drop
}
@@ -602,7 +671,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
match e {
BlockError::BeaconChainError(e) => {
// Internal error
error!(self.log, "Beacon chain error processing lookup component"; "block_root" => %block_root, "error" => ?e);
error!(%block_root, error = ?e, "Beacon chain error processing lookup component");
Action::Drop
}
BlockError::ParentUnknown { parent_root, .. } => {
@@ -618,10 +687,9 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
// These errors indicate that the execution layer is offline
// and failed to validate the execution payload. Do not downscore peer.
debug!(
self.log,
"Single block lookup failed. Execution layer is offline / unsynced / misconfigured";
"block_root" => ?block_root,
"error" => ?e
?block_root,
error = ?e,
"Single block lookup failed. Execution layer is offline / unsynced / misconfigured"
);
Action::Drop
}
@@ -629,7 +697,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
if e.category() == AvailabilityCheckErrorCategory::Internal =>
{
// These errors indicate internal problems and should not downscore the peer
warn!(self.log, "Internal availability check failure"; "block_root" => ?block_root, "error" => ?e);
warn!(?block_root, error = ?e, "Internal availability check failure");
// Here we choose *not* to call `on_processing_failure` because this could result in a bad
// lookup state transition. This error invalidates both blob and block requests, and we don't know the
@@ -638,7 +706,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
Action::Drop
}
other => {
debug!(self.log, "Invalid lookup component"; "block_root" => ?block_root, "component" => ?R::response_type(), "error" => ?other);
debug!(
?block_root,
component = ?R::response_type(),
error = ?other,
"Invalid lookup component"
);
let peer_group = request_state.on_processing_failure()?;
let peers_to_penalize: Vec<_> = match other {
// Note: currently only InvalidColumn errors have index granularity,
@@ -685,7 +758,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
Action::ParentUnknown { parent_root } => {
let peers = lookup.all_peers();
lookup.set_awaiting_parent(parent_root);
debug!(self.log, "Marking lookup as awaiting parent"; "id" => lookup.id, "block_root" => ?block_root, "parent_root" => ?parent_root);
debug!(
id = lookup.id,
?block_root,
?parent_root,
"Marking lookup as awaiting parent"
);
self.search_parent_of_child(parent_root, block_root, &peers, cx);
Ok(LookupResult::Pending)
}
@@ -700,6 +778,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
}
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn on_external_processing_result(
&mut self,
block_root: Hash256,
@@ -725,13 +809,24 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
/// Makes progress on the immediate children of `block_root`
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn continue_child_lookups(&mut self, block_root: Hash256, cx: &mut SyncNetworkContext<T>) {
let mut lookup_results = vec![]; // < need to buffer lookup results to not re-borrow &mut self
for (id, lookup) in self.single_block_lookups.iter_mut() {
if lookup.awaiting_parent() == Some(block_root) {
lookup.resolve_awaiting_parent();
debug!(self.log, "Continuing child lookup"; "parent_root" => ?block_root, "id" => id, "block_root" => ?lookup.block_root());
debug!(
parent_root = ?block_root,
id,
block_root = ?lookup.block_root(),
"Continuing child lookup"
);
let result = lookup.continue_requests(cx);
lookup_results.push((*id, result));
}
@@ -745,12 +840,19 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Drops `dropped_id` lookup and all its children recursively. Lookups awaiting a parent need
/// the parent to make progress to resolve, therefore we must drop them if the parent is
/// dropped.
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn drop_lookup_and_children(&mut self, dropped_id: SingleLookupId) {
if let Some(dropped_lookup) = self.single_block_lookups.remove(&dropped_id) {
debug!(self.log, "Dropping lookup";
"id" => ?dropped_id,
"block_root" => ?dropped_lookup.block_root(),
"awaiting_parent" => ?dropped_lookup.awaiting_parent(),
debug!(
id = ?dropped_id,
block_root = ?dropped_lookup.block_root(),
awaiting_parent = ?dropped_lookup.awaiting_parent(),
"Dropping lookup"
);
let child_lookups = self
@@ -768,6 +870,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Common handler for a lookup request error, drop it and update metrics
/// Returns true if the lookup is created or already exists
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn on_lookup_result(
&mut self,
id: SingleLookupId,
@@ -779,13 +887,13 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
Ok(LookupResult::Pending) => true, // no action
Ok(LookupResult::Completed) => {
if let Some(lookup) = self.single_block_lookups.remove(&id) {
debug!(self.log, "Dropping completed lookup"; "block" => ?lookup.block_root(), "id" => id);
debug!(block = ?lookup.block_root(), id, "Dropping completed lookup");
metrics::inc_counter(&metrics::SYNC_LOOKUP_COMPLETED);
// Block imported, continue the requests of pending child blocks
self.continue_child_lookups(lookup.block_root(), cx);
self.update_metrics();
} else {
debug!(self.log, "Attempting to drop non-existent lookup"; "id" => id);
debug!(id, "Attempting to drop non-existent lookup");
}
false
}
@@ -793,7 +901,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
// update metrics because the lookup does not exist.
Err(LookupRequestError::UnknownLookup) => false,
Err(error) => {
debug!(self.log, "Dropping lookup on request error"; "id" => id, "source" => source, "error" => ?error);
debug!(id, source, ?error, "Dropping lookup on request error");
metrics::inc_counter_vec(&metrics::SYNC_LOOKUP_DROPPED, &[error.into()]);
self.drop_lookup_and_children(id);
self.update_metrics();
@@ -805,12 +913,24 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/* Helper functions */
/// Discards every tracked single block lookup and reports how many were discarded.
#[instrument(
    parent = None,
    level = "info",
    fields(service = "lookup_sync"),
    name = "lookup_sync",
    skip_all
)]
pub fn drop_single_block_requests(&mut self) -> usize {
    // Capture the count first: `clear` leaves the collection empty.
    let dropped_count = self.single_block_lookups.len();
    self.single_block_lookups.clear();
    dropped_count
}
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn update_metrics(&self) {
metrics::set_gauge(
&metrics::SYNC_SINGLE_BLOCK_LOOKUPS,
@@ -819,6 +939,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
/// Perform some prune operations on lookups on some interval
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
pub fn prune_lookups(&mut self) {
self.drop_lookups_without_peers();
self.drop_stuck_lookups();
@@ -842,6 +968,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
///
/// Instead there's no negative for keeping lookups with no peers around for some time. If we
/// regularly prune them, it should not be a memory concern (TODO: maybe yes!).
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn drop_lookups_without_peers(&mut self) {
for (lookup_id, block_root) in self
.single_block_lookups
@@ -857,9 +989,10 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
.map(|lookup| (lookup.id, lookup.block_root()))
.collect::<Vec<_>>()
{
debug!(self.log, "Dropping lookup with no peers";
"id" => lookup_id,
"block_root" => ?block_root
debug!(
id = lookup_id,
%block_root,
"Dropping lookup with no peers"
);
self.drop_lookup_and_children(lookup_id);
}
@@ -878,6 +1011,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
///
/// - One single clear warn level log per stuck incident
/// - If the original bug is sporadic, it reduces the time a node is stuck from forever to 15 min
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn drop_stuck_lookups(&mut self) {
// While loop to find and drop all disjoint trees of potentially stuck lookups.
while let Some(stuck_lookup) = self.single_block_lookups.values().find(|lookup| {
@@ -886,7 +1025,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
let ancestor_stuck_lookup = match self.find_oldest_ancestor_lookup(stuck_lookup) {
Ok(lookup) => lookup,
Err(e) => {
warn!(self.log, "Error finding oldest ancestor lookup"; "error" => ?e);
warn!(error = ?e,"Error finding oldest ancestor lookup");
// Default to dropping the lookup that exceeds the max duration so at least
// eventually sync should be unstuck
stuck_lookup
@@ -894,16 +1033,18 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
};
if stuck_lookup.id == ancestor_stuck_lookup.id {
warn!(self.log, "Notify the devs a sync lookup is stuck";
"block_root" => ?stuck_lookup.block_root(),
"lookup" => ?stuck_lookup,
warn!(
block_root = ?stuck_lookup.block_root(),
lookup = ?stuck_lookup,
"Notify the devs a sync lookup is stuck"
);
} else {
warn!(self.log, "Notify the devs a sync lookup is stuck";
"block_root" => ?stuck_lookup.block_root(),
"lookup" => ?stuck_lookup,
"ancestor_block_root" => ?ancestor_stuck_lookup.block_root(),
"ancestor_lookup" => ?ancestor_stuck_lookup,
warn!(
block_root = ?stuck_lookup.block_root(),
lookup = ?stuck_lookup,
ancestor_block_root = ?ancestor_stuck_lookup.block_root(),
ancestor_lookup = ?ancestor_stuck_lookup,
"Notify the devs a sync lookup is stuck"
);
}
@@ -913,6 +1054,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
}
/// Recursively find the oldest ancestor lookup of another lookup
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn find_oldest_ancestor_lookup<'a>(
&'a self,
lookup: &'a SingleBlockLookup<T>,
@@ -937,6 +1084,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/// Adds peers to a lookup and its ancestors recursively.
/// Note: Takes a `lookup_id` as argument to allow recursion on mutable lookups, without having
/// to duplicate the code to add peers to a lookup
#[instrument(parent = None,
level = "info",
fields(service = "lookup_sync"),
name = "lookup_sync",
skip_all
)]
fn add_peers_to_lookup_and_ancestors(
&mut self,
lookup_id: SingleLookupId,
@@ -952,9 +1105,10 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
for peer in peers {
if lookup.add_peer(*peer) {
added_some_peer = true;
debug!(self.log, "Adding peer to existing single block lookup";
"block_root" => ?lookup.block_root(),
"peer" => ?peer
debug!(
block_root = ?lookup.block_root(),
?peer,
"Adding peer to existing single block lookup"
);
}
}

View File

@@ -63,12 +63,13 @@ use lighthouse_network::service::api_types::{
use lighthouse_network::types::{NetworkGlobals, SyncState};
use lighthouse_network::SyncInfo;
use lighthouse_network::{PeerAction, PeerId};
use logging::crit;
use lru_cache::LRUTimeCache;
use slog::{crit, debug, error, info, o, trace, warn, Logger};
use std::ops::Sub;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::mpsc;
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
use types::{
BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, Hash256, SignedBeaconBlock, Slot,
};
@@ -246,9 +247,6 @@ pub struct SyncManager<T: BeaconChainTypes> {
notified_unknown_roots: LRUTimeCache<(PeerId, Hash256)>,
sampling: Sampling<T>,
/// The logger for the import manager.
log: Logger,
}
/// Spawns a new `SyncManager` thread which has a weak reference to underlying beacon
@@ -261,7 +259,6 @@ pub fn spawn<T: BeaconChainTypes>(
beacon_processor: Arc<NetworkBeaconProcessor<T>>,
sync_recv: mpsc::UnboundedReceiver<SyncMessage<T::EthSpec>>,
fork_context: Arc<ForkContext>,
log: slog::Logger,
) {
assert!(
beacon_chain.spec.max_request_blocks(fork_context.current_fork()) as u64 >= T::EthSpec::slots_per_epoch() * EPOCHS_PER_BATCH,
@@ -276,12 +273,18 @@ pub fn spawn<T: BeaconChainTypes>(
sync_recv,
SamplingConfig::Default,
fork_context,
log.clone(),
);
// spawn the sync manager thread
debug!(log, "Sync Manager started");
executor.spawn(async move { Box::pin(sync_manager.main()).await }, "sync");
debug!("Sync Manager started");
executor.spawn(
async move {
Box::pin(sync_manager.main())
.instrument(info_span!("", service = "sync"))
.await
},
"sync",
);
}
impl<T: BeaconChainTypes> SyncManager<T> {
@@ -292,7 +295,6 @@ impl<T: BeaconChainTypes> SyncManager<T> {
sync_recv: mpsc::UnboundedReceiver<SyncMessage<T::EthSpec>>,
sampling_config: SamplingConfig,
fork_context: Arc<ForkContext>,
log: slog::Logger,
) -> Self {
let network_globals = beacon_processor.network_globals.clone();
Self {
@@ -303,23 +305,14 @@ impl<T: BeaconChainTypes> SyncManager<T> {
beacon_processor.clone(),
beacon_chain.clone(),
fork_context.clone(),
log.clone(),
),
range_sync: RangeSync::new(
beacon_chain.clone(),
log.new(o!("service" => "range_sync")),
),
backfill_sync: BackFillSync::new(
beacon_chain.clone(),
network_globals,
log.new(o!("service" => "backfill_sync")),
),
block_lookups: BlockLookups::new(log.new(o!("service"=> "lookup_sync"))),
range_sync: RangeSync::new(beacon_chain.clone()),
backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals),
block_lookups: BlockLookups::new(),
notified_unknown_roots: LRUTimeCache::new(Duration::from_secs(
NOTIFIED_UNKNOWN_ROOT_EXPIRY_SECONDS,
)),
sampling: Sampling::new(sampling_config, log.new(o!("service" => "sampling"))),
log: log.clone(),
sampling: Sampling::new(sampling_config),
}
}
@@ -461,10 +454,10 @@ impl<T: BeaconChainTypes> SyncManager<T> {
};
let head_slot = head_slot.unwrap_or_else(|| {
debug!(self.log,
"On add peers force range sync assuming local head_slot";
"local_head_slot" => local.head_slot,
"head_root" => ?head_root
debug!(
local_head_slot = %local.head_slot,
?head_root,
"On add peers force range sync assuming local head_slot"
);
local.head_slot
});
@@ -485,7 +478,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
/// Handles RPC errors related to requests that were emitted from the sync manager.
fn inject_error(&mut self, peer_id: PeerId, request_id: SyncRequestId, error: RPCError) {
trace!(self.log, "Sync manager received a failed RPC");
trace!("Sync manager received a failed RPC");
match request_id {
SyncRequestId::SingleBlock { id } => {
self.on_single_block_response(id, peer_id, RpcEvent::RPCError(error))
@@ -565,15 +558,14 @@ impl<T: BeaconChainTypes> SyncManager<T> {
let is_connected = self.network_globals().peers.read().is_connected(peer_id);
if was_updated {
debug!(
self.log,
"Peer transitioned sync state";
"peer_id" => %peer_id,
"new_state" => rpr,
"our_head_slot" => local_sync_info.head_slot,
"our_finalized_epoch" => local_sync_info.finalized_epoch,
"their_head_slot" => remote_sync_info.head_slot,
"their_finalized_epoch" => remote_sync_info.finalized_epoch,
"is_connected" => is_connected
%peer_id,
new_state = rpr,
our_head_slot = %local_sync_info.head_slot,
our_finalized_epoch = %local_sync_info.finalized_epoch,
their_head_slot = %remote_sync_info.head_slot,
their_finalized_epoch = %remote_sync_info.finalized_epoch,
is_connected,
"Peer transitioned sync state"
);
// A peer has transitioned its sync state. If the new state is "synced" we
@@ -584,7 +576,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
}
is_connected
} else {
error!(self.log, "Status'd peer is unknown"; "peer_id" => %peer_id);
error!(%peer_id, "Status'd peer is unknown");
false
}
}
@@ -603,7 +595,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
fn update_sync_state(&mut self) {
let new_state: SyncState = match self.range_sync.state() {
Err(e) => {
crit!(self.log, "Error getting range sync state"; "error" => %e);
crit!(error = %e, "Error getting range sync state");
return;
}
Ok(state) => match state {
@@ -652,7 +644,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
}
Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if the backfill sync state didn't start.
Err(e) => {
error!(self.log, "Backfill sync failed to start"; "error" => ?e);
error!(error = ?e, "Backfill sync failed to start");
}
}
}
@@ -686,7 +678,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
let old_state = self.network_globals().set_sync_state(new_state);
let new_state = self.network_globals().sync_state.read().clone();
if !new_state.eq(&old_state) {
info!(self.log, "Sync state updated"; "old_state" => %old_state, "new_state" => %new_state);
info!(%old_state, %new_state, "Sync state updated");
// If we have become synced - Subscribe to all the core subnet topics
// We don't need to subscribe if the old state is a state that would have already
// invoked this call.
@@ -781,7 +773,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
SyncMessage::UnknownParentBlock(peer_id, block, block_root) => {
let block_slot = block.slot();
let parent_root = block.parent_root();
debug!(self.log, "Received unknown parent block message"; "block_root" => %block_root, "parent_root" => %parent_root);
debug!(%block_root, %parent_root, "Received unknown parent block message");
self.handle_unknown_parent(
peer_id,
block_root,
@@ -799,7 +791,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
let blob_slot = blob.slot();
let block_root = blob.block_root();
let parent_root = blob.block_parent_root();
debug!(self.log, "Received unknown parent blob message"; "block_root" => %block_root, "parent_root" => %parent_root);
debug!(%block_root, %parent_root, "Received unknown parent blob message");
self.handle_unknown_parent(
peer_id,
block_root,
@@ -817,7 +809,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
let data_column_slot = data_column.slot();
let block_root = data_column.block_root();
let parent_root = data_column.block_parent_root();
debug!(self.log, "Received unknown parent data column message"; "block_root" => %block_root, "parent_root" => %parent_root);
debug!(%block_root, %parent_root, "Received unknown parent data column message");
self.handle_unknown_parent(
peer_id,
block_root,
@@ -834,12 +826,12 @@ impl<T: BeaconChainTypes> SyncManager<T> {
SyncMessage::UnknownBlockHashFromAttestation(peer_id, block_root) => {
if !self.notified_unknown_roots.contains(&(peer_id, block_root)) {
self.notified_unknown_roots.insert((peer_id, block_root));
debug!(self.log, "Received unknown block hash message"; "block_root" => ?block_root, "peer" => ?peer_id);
debug!(?block_root, ?peer_id, "Received unknown block hash message");
self.handle_unknown_block_root(peer_id, block_root);
}
}
SyncMessage::SampleBlock(block_root, block_slot) => {
debug!(self.log, "Received SampleBlock message"; "block_root" => %block_root, "slot" => block_slot);
debug!(%block_root, slot = %block_slot, "Received SampleBlock message");
if let Some((requester, result)) = self
.sampling
.on_new_sample_request(block_root, &mut self.network)
@@ -848,7 +840,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
}
}
SyncMessage::Disconnect(peer_id) => {
debug!(self.log, "Received disconnected message"; "peer_id" => %peer_id);
debug!(%peer_id, "Received disconnected message");
self.peer_disconnect(&peer_id);
}
SyncMessage::RpcError {
@@ -889,7 +881,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
Ok(ProcessResult::Successful) => {}
Ok(ProcessResult::SyncCompleted) => self.update_sync_state(),
Err(error) => {
error!(self.log, "Backfill sync failed"; "error" => ?error);
error!(error = ?error, "Backfill sync failed");
// Update the global status
self.update_sync_state();
}
@@ -925,7 +917,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
);
}
Err(reason) => {
debug!(self.log, "Ignoring unknown parent request"; "block_root" => %block_root, "parent_root" => %parent_root, "reason" => reason);
debug!(%block_root, %parent_root, reason, "Ignoring unknown parent request");
}
}
}
@@ -937,7 +929,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
.search_unknown_block(block_root, &[peer_id], &mut self.network);
}
Err(reason) => {
debug!(self.log, "Ignoring unknown block request"; "block_root" => %block_root, "reason" => reason);
debug!(%block_root, reason, "Ignoring unknown block request");
}
}
}
@@ -1015,8 +1007,9 @@ impl<T: BeaconChainTypes> SyncManager<T> {
// Some logs.
if dropped_single_blocks_requests > 0 {
debug!(self.log, "Execution engine not online. Dropping active requests.";
"dropped_single_blocks_requests" => dropped_single_blocks_requests,
debug!(
dropped_single_blocks_requests,
"Execution engine not online. Dropping active requests."
);
}
}
@@ -1042,7 +1035,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
RpcEvent::from_chunk(block, seen_timestamp),
),
_ => {
crit!(self.log, "bad request id for block"; "peer_id" => %peer_id );
crit!(%peer_id, "bad request id for block");
}
}
}
@@ -1084,7 +1077,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
RpcEvent::from_chunk(blob, seen_timestamp),
),
_ => {
crit!(self.log, "bad request id for blob"; "peer_id" => %peer_id);
crit!(%peer_id, "bad request id for blob");
}
}
}
@@ -1110,7 +1103,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
RpcEvent::from_chunk(data_column, seen_timestamp),
),
_ => {
crit!(self.log, "bad request id for data_column"; "peer_id" => %peer_id);
crit!(%peer_id, "bad request id for data_column");
}
}
}
@@ -1228,7 +1221,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
fn on_sampling_result(&mut self, requester: SamplingRequester, result: SamplingResult) {
match requester {
SamplingRequester::ImportedBlock(block_root) => {
debug!(self.log, "Sampling result"; "block_root" => %block_root, "result" => ?result);
debug!(%block_root, ?result, "Sampling result");
match result {
Ok(_) => {
@@ -1239,11 +1232,11 @@ impl<T: BeaconChainTypes> SyncManager<T> {
.beacon_processor()
.send_sampling_completed(block_root)
{
warn!(self.log, "Error sending sampling result"; "block_root" => ?block_root, "reason" => ?e);
warn!(?block_root, reason = ?e, "Error sending sampling result");
}
}
Err(e) => {
warn!(self.log, "Sampling failed"; "block_root" => %block_root, "reason" => ?e);
warn!(?block_root, reason = ?e, "Sampling failed");
}
}
}

View File

@@ -34,13 +34,13 @@ use requests::{
ActiveRequests, BlobsByRangeRequestItems, BlobsByRootRequestItems, BlocksByRangeRequestItems,
BlocksByRootRequestItems, DataColumnsByRangeRequestItems, DataColumnsByRootRequestItems,
};
use slog::{debug, error, warn};
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::fmt::Debug;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::mpsc;
use tracing::{debug, error, span, warn, Level};
use types::blob_sidecar::FixedBlobSidecarList;
use types::{
BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext,
@@ -74,10 +74,10 @@ pub type CustodyByRootResult<T> =
#[derive(Debug)]
pub enum RpcResponseError {
RpcError(RPCError),
RpcError(#[allow(dead_code)] RPCError),
VerifyError(LookupVerifyError),
CustodyRequestError(CustodyRequestError),
BlockComponentCouplingError(String),
CustodyRequestError(#[allow(dead_code)] CustodyRequestError),
BlockComponentCouplingError(#[allow(dead_code)] String),
}
#[derive(Debug, PartialEq, Eq)]
@@ -89,6 +89,19 @@ pub enum RpcRequestSendError {
SlotClockError,
}
impl std::fmt::Display for RpcRequestSendError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
RpcRequestSendError::NetworkSendError => write!(f, "Network send error"),
RpcRequestSendError::NoCustodyPeers => write!(f, "No custody peers"),
RpcRequestSendError::CustodyRequestError(e) => {
write!(f, "Custody request error: {:?}", e)
}
RpcRequestSendError::SlotClockError => write!(f, "Slot clock error"),
}
}
}
#[derive(Debug, PartialEq, Eq)]
pub enum SendErrorProcessor {
SendError,
@@ -201,9 +214,6 @@ pub struct SyncNetworkContext<T: BeaconChainTypes> {
pub chain: Arc<BeaconChain<T>>,
fork_context: Arc<ForkContext>,
/// Logger for the `SyncNetworkContext`.
pub log: slog::Logger,
}
/// Small enumeration to make dealing with block and blob requests easier.
@@ -219,8 +229,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
network_beacon_processor: Arc<NetworkBeaconProcessor<T>>,
chain: Arc<BeaconChain<T>>,
fork_context: Arc<ForkContext>,
log: slog::Logger,
) -> Self {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
SyncNetworkContext {
network_send,
execution_engine_state: EngineState::Online, // always assume `Online` at the start
@@ -236,7 +251,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
network_beacon_processor,
chain,
fork_context,
log,
}
}
@@ -267,7 +281,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
network_beacon_processor: _,
chain: _,
fork_context: _,
log: _,
} = self;
let blocks_by_root_ids = blocks_by_root_requests
@@ -330,17 +343,23 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
}
pub fn status_peers<C: ToStatusMessage>(&self, chain: &C, peers: impl Iterator<Item = PeerId>) {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let status_message = chain.status_message();
for peer_id in peers {
debug!(
self.log,
"Sending Status Request";
"peer" => %peer_id,
"fork_digest" => ?status_message.fork_digest,
"finalized_root" => ?status_message.finalized_root,
"finalized_epoch" => ?status_message.finalized_epoch,
"head_root" => %status_message.head_root,
"head_slot" => %status_message.head_slot,
peer = %peer_id,
fork_digest = ?status_message.fork_digest,
finalized_root = ?status_message.finalized_root,
finalized_epoch = ?status_message.finalized_epoch,
head_root = %status_message.head_root,
head_slot = %status_message.head_slot,
"Sending Status Request"
);
let request = RequestType::Status(status_message.clone());
@@ -385,7 +404,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
let (expects_columns, data_column_requests) =
if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) {
let column_indexes = self.network_globals().sampling_columns.clone();
let data_column_requests = self
.make_columns_by_range_requests(request, &column_indexes)?
.into_iter()
@@ -518,6 +536,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
return Ok(LookupRequestResult::Pending("no peers"));
};
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
match self.chain.get_block_process_status(&block_root) {
// Unknown block, continue request to download
BlockProcessStatus::Unknown => {}
@@ -560,12 +585,11 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
.map_err(|_| RpcRequestSendError::NetworkSendError)?;
debug!(
self.log,
"Sync RPC request sent";
"method" => "BlocksByRoot",
"block_root" => ?block_root,
"peer" => %peer_id,
"id" => %id
method = "BlocksByRoot",
?block_root,
peer = %peer_id,
%id,
"Sync RPC request sent"
);
self.blocks_by_root_requests.insert(
@@ -608,6 +632,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
return Ok(LookupRequestResult::Pending("no peers"));
};
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let imported_blob_indexes = self
.chain
.data_availability_checker
@@ -643,13 +674,12 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
.map_err(|_| RpcRequestSendError::NetworkSendError)?;
debug!(
self.log,
"Sync RPC request sent";
"method" => "BlobsByRoot",
"block_root" => ?block_root,
"blob_indices" => ?indices,
"peer" => %peer_id,
"id" => %id
method = "BlobsByRoot",
?block_root,
blob_indices = ?indices,
peer = %peer_id,
%id,
"Sync RPC request sent"
);
self.blobs_by_root_requests.insert(
@@ -673,6 +703,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
request: DataColumnsByRootSingleBlockRequest,
expect_max_responses: bool,
) -> Result<LookupRequestResult<DataColumnsByRootRequestId>, &'static str> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let id = DataColumnsByRootRequestId {
id: self.next_id(),
requester,
@@ -685,13 +722,12 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
})?;
debug!(
self.log,
"Sync RPC request sent";
"method" => "DataColumnsByRoot",
"block_root" => ?request.block_root,
"indices" => ?request.indices,
"peer" => %peer_id,
"id" => %id,
method = "DataColumnsByRoot",
block_root = ?request.block_root,
indices = ?request.indices,
peer = %peer_id,
%id,
"Sync RPC request sent"
);
self.data_columns_by_root_requests.insert(
@@ -714,6 +750,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
block_root: Hash256,
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
) -> Result<LookupRequestResult, RpcRequestSendError> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let custody_indexes_imported = self
.chain
.data_availability_checker
@@ -740,11 +783,10 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
};
debug!(
self.log,
"Starting custody columns request";
"block_root" => ?block_root,
"indices" => ?custody_indexes_to_fetch,
"id" => %id
?block_root,
indices = ?custody_indexes_to_fetch,
%id,
"Starting custody columns request"
);
let requester = CustodyRequester(id);
@@ -753,7 +795,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
CustodyId { requester },
&custody_indexes_to_fetch,
lookup_peers,
self.log.clone(),
);
// Note that you can only send, but not handle a response here
@@ -788,13 +829,12 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
.map_err(|_| RpcRequestSendError::NetworkSendError)?;
debug!(
self.log,
"Sync RPC request sent";
"method" => "BlocksByRange",
"slots" => request.count(),
"epoch" => Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()),
"peer" => %peer_id,
"id" => %id,
method = "BlocksByRange",
slots = request.count(),
epoch = %Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()),
peer = %peer_id,
%id,
"Sync RPC request sent"
);
self.blocks_by_range_requests.insert(
@@ -830,13 +870,12 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
.map_err(|_| RpcRequestSendError::NetworkSendError)?;
debug!(
self.log,
"Sync RPC request sent";
"method" => "BlobsByRange",
"slots" => request.count,
"epoch" => request_epoch,
"peer" => %peer_id,
"id" => %id,
method = "BlobsByRange",
slots = request.count,
epoch = %request_epoch,
peer = %peer_id,
%id,
"Sync RPC request sent"
);
let max_blobs_per_block = self.chain.spec.max_blobs_per_block(request_epoch);
@@ -870,14 +909,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
.map_err(|_| RpcRequestSendError::NetworkSendError)?;
debug!(
self.log,
"Sync RPC request sent";
"method" => "DataColumnsByRange",
"slots" => request.count,
"epoch" => Slot::new(request.start_slot).epoch(T::EthSpec::slots_per_epoch()),
"columns" => ?request.columns,
"peer" => %peer_id,
"id" => %id,
method = "DataColumnsByRange",
slots = request.count,
epoch = %Slot::new(request.start_slot).epoch(T::EthSpec::slots_per_epoch()),
columns = ?request.columns,
peer = %peer_id,
%id,
"Sync RPC request sent"
);
self.data_columns_by_range_requests.insert(
@@ -896,13 +934,26 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
}
/// Records sync's latest view of the execution engine state.
///
/// The transition (previous and new state) is logged at debug level inside a
/// `SyncNetworkContext` span before the stored state is overwritten.
pub fn update_execution_engine_state(&mut self, engine_state: EngineState) {
    let span = span!(
        Level::INFO,
        "SyncNetworkContext",
        service = "network_context"
    );
    let _enter = span.enter();

    // Log before assigning so that `past_state` still holds the previous value.
    debug!(
        past_state = ?self.execution_engine_state,
        new_state = ?engine_state,
        "Sync's view on execution engine state updated"
    );
    self.execution_engine_state = engine_state;
}
/// Terminates the connection with the peer and bans them.
pub fn goodbye_peer(&mut self, peer_id: PeerId, reason: GoodbyeReason) {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
self.network_send
.send(NetworkMessage::GoodbyePeer {
peer_id,
@@ -910,13 +961,20 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
source: ReportSource::SyncService,
})
.unwrap_or_else(|_| {
warn!(self.log, "Could not report peer: channel failed");
warn!("Could not report peer: channel failed");
});
}
/// Reports to the scoring algorithm the behaviour of a peer.
pub fn report_peer(&self, peer_id: PeerId, action: PeerAction, msg: &'static str) {
debug!(self.log, "Sync reporting peer"; "peer_id" => %peer_id, "action" => %action, "msg" => %msg);
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
debug!(%peer_id, %action, %msg, "Sync reporting peer");
self.network_send
.send(NetworkMessage::ReportPeer {
peer_id,
@@ -925,23 +983,37 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
msg,
})
.unwrap_or_else(|e| {
warn!(self.log, "Could not report peer: channel failed"; "error"=> %e);
warn!(error = %e, "Could not report peer: channel failed");
});
}
/// Subscribes to core topics.
///
/// Sends `NetworkMessage::SubscribeCoreTopics` over the network channel. A failed send is
/// not returned to the caller; it is only reported as a warning.
pub fn subscribe_core_topics(&self) {
    let span = span!(
        Level::INFO,
        "SyncNetworkContext",
        service = "network_context"
    );
    let _enter = span.enter();

    self.network_send
        .send(NetworkMessage::SubscribeCoreTopics)
        .unwrap_or_else(|e| {
            warn!(error = %e, "Could not subscribe to core topics.");
        });
}
/// Sends an arbitrary network message.
///
/// # Errors
///
/// Returns `Err("Network channel send Failed")` when the channel send fails; the failure is
/// also logged at debug level.
fn send_network_msg(&self, msg: NetworkMessage<T::EthSpec>) -> Result<(), &'static str> {
    let span = span!(
        Level::INFO,
        "SyncNetworkContext",
        service = "network_context"
    );
    let _enter = span.enter();

    self.network_send.send(msg).map_err(|_| {
        debug!("Could not send message to the network service");
        "Network channel send Failed"
    })
}
@@ -1128,20 +1200,18 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
None => {}
Some(Ok((v, _))) => {
debug!(
self.log,
"Sync RPC request completed";
"id" => %id,
"method" => method,
"count" => get_count(v)
%id,
method,
count = get_count(v),
"Sync RPC request completed"
);
}
Some(Err(e)) => {
debug!(
self.log,
"Sync RPC request error";
"id" => %id,
"method" => method,
"error" => ?e
%id,
method,
error = ?e,
"Sync RPC request error"
);
}
}
@@ -1166,11 +1236,18 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
peer_id: PeerId,
resp: RpcResponseResult<Vec<Arc<DataColumnSidecar<T::EthSpec>>>>,
) -> Option<CustodyByRootResult<T::EthSpec>> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
// Note: need to remove the request to borrow self again below. Otherwise we can't
// do nested requests
let Some(mut request) = self.custody_by_root_requests.remove(&id.requester) else {
// TODO(das): This log can happen if the request is error'ed early and dropped
debug!(self.log, "Custody column downloaded event for unknown request"; "id" => ?id);
debug!(?id, "Custody column downloaded event for unknown request");
return None;
};
@@ -1185,6 +1262,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
request: ActiveCustodyRequest<T>,
result: CustodyRequestResult<T::EthSpec>,
) -> Option<CustodyByRootResult<T::EthSpec>> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let result = result
.map_err(RpcResponseError::CustodyRequestError)
.transpose();
@@ -1193,10 +1277,10 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
// an Option first to use in an `if let Some() { act on result }` block.
match result.as_ref() {
Some(Ok((columns, peer_group, _))) => {
debug!(self.log, "Custody request success, removing"; "id" => ?id, "count" => columns.len(), "peers" => ?peer_group)
debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing")
}
Some(Err(e)) => {
debug!(self.log, "Custody request failure, removing"; "id" => ?id, "error" => ?e)
debug!(?id, error = ?e, "Custody request failure, removing" )
}
None => {
self.custody_by_root_requests.insert(id, request);
@@ -1212,11 +1296,18 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
block: RpcBlock<T::EthSpec>,
seen_timestamp: Duration,
) -> Result<(), SendErrorProcessor> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let beacon_processor = self
.beacon_processor_if_enabled()
.ok_or(SendErrorProcessor::ProcessorNotAvailable)?;
debug!(self.log, "Sending block for processing"; "block" => ?block_root, "id" => id);
debug!(block = ?block_root, id, "Sending block for processing");
// Lookup sync event safety: If `beacon_processor.send_rpc_beacon_block` returns Ok() sync
// must receive a single `SyncMessage::BlockComponentProcessed` with this process type
beacon_processor
@@ -1228,9 +1319,8 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
)
.map_err(|e| {
error!(
self.log,
"Failed to send sync block to processor";
"error" => ?e
error = ?e,
"Failed to send sync block to processor"
);
SendErrorProcessor::SendError
})
@@ -1243,11 +1333,18 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
blobs: FixedBlobSidecarList<T::EthSpec>,
seen_timestamp: Duration,
) -> Result<(), SendErrorProcessor> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let beacon_processor = self
.beacon_processor_if_enabled()
.ok_or(SendErrorProcessor::ProcessorNotAvailable)?;
debug!(self.log, "Sending blobs for processing"; "block" => ?block_root, "id" => id);
debug!(?block_root, ?id, "Sending blobs for processing");
// Lookup sync event safety: If `beacon_processor.send_rpc_blobs` returns Ok() sync
// must receive a single `SyncMessage::BlockComponentProcessed` event with this process type
beacon_processor
@@ -1259,9 +1356,8 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
)
.map_err(|e| {
error!(
self.log,
"Failed to send sync blobs to processor";
"error" => ?e
error = ?e,
"Failed to send sync blobs to processor"
);
SendErrorProcessor::SendError
})
@@ -1275,19 +1371,29 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
seen_timestamp: Duration,
process_type: BlockProcessType,
) -> Result<(), SendErrorProcessor> {
let span = span!(
Level::INFO,
"SyncNetworkContext",
service = "network_context"
);
let _enter = span.enter();
let beacon_processor = self
.beacon_processor_if_enabled()
.ok_or(SendErrorProcessor::ProcessorNotAvailable)?;
debug!(self.log, "Sending custody columns for processing"; "block" => ?block_root, "process_type" => ?process_type);
debug!(
?block_root,
?process_type,
"Sending custody columns for processing"
);
beacon_processor
.send_rpc_custody_columns(block_root, custody_columns, seen_timestamp, process_type)
.map_err(|e| {
error!(
self.log,
"Failed to send sync custody columns to processor";
"error" => ?e
error = ?e,
"Failed to send sync custody columns to processor"
);
SendErrorProcessor::SendError
})

View File

@@ -9,10 +9,10 @@ use lighthouse_network::PeerId;
use lru_cache::LRUTimeCache;
use parking_lot::RwLock;
use rand::Rng;
use slog::{debug, warn};
use std::collections::HashSet;
use std::time::{Duration, Instant};
use std::{collections::HashMap, marker::PhantomData, sync::Arc};
use tracing::{debug, warn};
use types::EthSpec;
use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256};
@@ -36,8 +36,7 @@ pub struct ActiveCustodyRequest<T: BeaconChainTypes> {
failed_peers: LRUTimeCache<PeerId>,
/// Set of peers that claim to have imported this block and their custody columns
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
/// Logger for the `SyncNetworkContext`.
pub log: slog::Logger,
_phantom: PhantomData<T>,
}
@@ -70,7 +69,6 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
custody_id: CustodyId,
column_indices: &[ColumnIndex],
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
log: slog::Logger,
) -> Self {
Self {
block_root,
@@ -83,7 +81,6 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
active_batch_columns_requests: <_>::default(),
failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)),
lookup_peers,
log,
_phantom: PhantomData,
}
}
@@ -104,24 +101,24 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
cx: &mut SyncNetworkContext<T>,
) -> CustodyRequestResult<T::EthSpec> {
let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else {
warn!(self.log,
"Received custody column response for unrequested index";
"id" => ?self.custody_id,
"block_root" => ?self.block_root,
"req_id" => %req_id,
warn!(
id = ?self.custody_id,
block_root = ?self.block_root,
%req_id,
"Received custody column response for unrequested index"
);
return Ok(None);
};
match resp {
Ok((data_columns, seen_timestamp)) => {
debug!(self.log,
"Custody column download success";
"id" => ?self.custody_id,
"block_root" => ?self.block_root,
"req_id" => %req_id,
"peer" => %peer_id,
"count" => data_columns.len()
debug!(
id = ?self.custody_id,
block_root = ?self.block_root,
%req_id,
%peer_id,
count = data_columns.len(),
"Custody column download success"
);
// Map columns by index as an optimization to not loop the returned list on each
@@ -163,27 +160,27 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
if !missing_column_indexes.is_empty() {
// Note: Batch logging that columns are missing to not spam logger
debug!(self.log,
"Custody column peer claims to not have some data";
"id" => ?self.custody_id,
"block_root" => ?self.block_root,
"req_id" => %req_id,
"peer" => %peer_id,
debug!(
id = ?self.custody_id,
block_root = ?self.block_root,
%req_id,
%peer_id,
// TODO(das): this property can become very noisy, being the full range 0..128
"missing_column_indexes" => ?missing_column_indexes
?missing_column_indexes,
"Custody column peer claims to not have some data"
);
self.failed_peers.insert(peer_id);
}
}
Err(err) => {
debug!(self.log,
"Custody column download error";
"id" => ?self.custody_id,
"block_root" => ?self.block_root,
"req_id" => %req_id,
"peer" => %peer_id,
"error" => ?err
debug!(
id = ?self.custody_id,
block_root = ?self.block_root,
%req_id,
%peer_id,
error = ?err,
"Custody column download error"
);
// TODO(das): Should mark peer as failed and try from another peer

View File

@@ -12,11 +12,11 @@ use lighthouse_network::service::api_types::{
};
use lighthouse_network::{PeerAction, PeerId};
use rand::{seq::SliceRandom, thread_rng};
use slog::{debug, error, warn};
use std::{
collections::hash_map::Entry, collections::HashMap, marker::PhantomData, sync::Arc,
time::Duration,
};
use tracing::{debug, error, instrument, warn};
use types::{data_column_sidecar::ColumnIndex, ChainSpec, DataColumnSidecar, Hash256};
pub type SamplingResult = Result<(), SamplingError>;
@@ -26,24 +26,35 @@ type DataColumnSidecarList<E> = Vec<Arc<DataColumnSidecar<E>>>;
pub struct Sampling<T: BeaconChainTypes> {
requests: HashMap<SamplingRequester, ActiveSamplingRequest<T>>,
sampling_config: SamplingConfig,
log: slog::Logger,
}
impl<T: BeaconChainTypes> Sampling<T> {
pub fn new(sampling_config: SamplingConfig, log: slog::Logger) -> Self {
#[instrument(parent = None,level = "info", fields(service = "sampling"), name = "sampling")]
pub fn new(sampling_config: SamplingConfig) -> Self {
Self {
requests: <_>::default(),
sampling_config,
log,
}
}
/// Test-only helper returning the block root of every tracked sampling request.
///
/// The order follows the iteration order of the internal request map.
#[cfg(test)]
#[instrument(parent = None,
    level = "info",
    fields(service = "sampling"),
    name = "sampling",
    skip_all
)]
pub fn active_sampling_requests(&self) -> Vec<Hash256> {
    let mut block_roots = Vec::with_capacity(self.requests.len());
    for request in self.requests.values() {
        block_roots.push(request.block_root);
    }
    block_roots
}
#[cfg(test)]
#[instrument(parent = None,
level = "info",
fields(service = "sampling"),
name = "sampling",
skip_all
)]
pub fn get_request_status(
&self,
block_root: Hash256,
@@ -61,6 +72,12 @@ impl<T: BeaconChainTypes> Sampling<T> {
///
/// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
/// - `None`: Request still active, requester should do no action
#[instrument(parent = None,
level = "info",
fields(service = "sampling"),
name = "sampling",
skip_all
)]
pub fn on_new_sample_request(
&mut self,
block_root: Hash256,
@@ -73,7 +90,6 @@ impl<T: BeaconChainTypes> Sampling<T> {
block_root,
id,
&self.sampling_config,
self.log.clone(),
&cx.chain.spec,
)),
Entry::Occupied(_) => {
@@ -82,15 +98,15 @@ impl<T: BeaconChainTypes> Sampling<T> {
// TODO(das): Should track failed sampling request for some time? Otherwise there's
// a risk of a loop with multiple triggers creating the request, then failing,
// and repeat.
debug!(self.log, "Ignoring duplicate sampling request"; "id" => ?id);
debug!(?id, "Ignoring duplicate sampling request");
return None;
}
};
debug!(self.log,
"Created new sample request";
"id" => ?id,
"column_selection" => ?request.column_selection()
debug!(
?id,
column_selection = ?request.column_selection(),
"Created new sample request"
);
// TODO(das): If a node has very few peers, continue_sampling() will attempt to find enough
@@ -107,6 +123,12 @@ impl<T: BeaconChainTypes> Sampling<T> {
///
/// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
/// - `None`: Request still active, requester should do no action
#[instrument(parent = None,
level = "info",
fields(service = "sampling"),
name = "sampling",
skip_all
)]
pub fn on_sample_downloaded(
&mut self,
id: SamplingId,
@@ -116,7 +138,7 @@ impl<T: BeaconChainTypes> Sampling<T> {
) -> Option<(SamplingRequester, SamplingResult)> {
let Some(request) = self.requests.get_mut(&id.id) else {
// TODO(das): This log can happen if the request is error'ed early and dropped
debug!(self.log, "Sample downloaded event for unknown request"; "id" => ?id);
debug!(?id, "Sample downloaded event for unknown request");
return None;
};
@@ -131,6 +153,12 @@ impl<T: BeaconChainTypes> Sampling<T> {
///
/// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
/// - `None`: Request still active, requester should do no action
#[instrument(parent = None,
level = "info",
fields(service = "sampling"),
name = "sampling",
skip_all
)]
pub fn on_sample_verified(
&mut self,
id: SamplingId,
@@ -139,7 +167,7 @@ impl<T: BeaconChainTypes> Sampling<T> {
) -> Option<(SamplingRequester, SamplingResult)> {
let Some(request) = self.requests.get_mut(&id.id) else {
// TODO(das): This log can happen if the request is error'ed early and dropped
debug!(self.log, "Sample verified event for unknown request"; "id" => ?id);
debug!(?id, "Sample verified event for unknown request");
return None;
};
@@ -150,6 +178,12 @@ impl<T: BeaconChainTypes> Sampling<T> {
/// Converts a result from the internal format of `ActiveSamplingRequest` (error first to use ?
/// conveniently), to an Option first format to use an `if let Some() { act on result }` pattern
/// in the sync manager.
#[instrument(parent = None,
level = "info",
fields(service = "sampling"),
name = "sampling",
skip_all
)]
fn handle_sampling_result(
&mut self,
result: Result<Option<()>, SamplingError>,
@@ -157,7 +191,7 @@ impl<T: BeaconChainTypes> Sampling<T> {
) -> Option<(SamplingRequester, SamplingResult)> {
let result = result.transpose();
if let Some(result) = result {
debug!(self.log, "Sampling request completed, removing"; "id" => ?id, "result" => ?result);
debug!(?id, ?result, "Sampling request completed, removing");
metrics::inc_counter_vec(
&metrics::SAMPLING_REQUEST_RESULT,
&[metrics::from_result(&result)],
@@ -180,8 +214,6 @@ pub struct ActiveSamplingRequest<T: BeaconChainTypes> {
current_sampling_request_id: SamplingRequestId,
column_shuffle: Vec<ColumnIndex>,
required_successes: Vec<usize>,
/// Logger for the `SyncNetworkContext`.
pub log: slog::Logger,
_phantom: PhantomData<T>,
}
@@ -212,7 +244,6 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
block_root: Hash256,
requester_id: SamplingRequester,
sampling_config: &SamplingConfig,
log: slog::Logger,
spec: &ChainSpec,
) -> Self {
// Select ahead of time the full list of to-sample columns
@@ -232,7 +263,6 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
SamplingConfig::Default => REQUIRED_SUCCESSES.to_vec(),
SamplingConfig::Custom { required_successes } => required_successes.clone(),
},
log,
_phantom: PhantomData,
}
}
@@ -275,9 +305,9 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
.column_indexes_by_sampling_request
.get(&sampling_request_id)
else {
error!(self.log,
"Column indexes for the sampling request ID not found";
"sampling_request_id" => ?sampling_request_id
error!(
?sampling_request_id,
"Column indexes for the sampling request ID not found"
);
return Ok(None);
};
@@ -288,11 +318,11 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
.iter()
.map(|r| r.index)
.collect::<Vec<_>>();
debug!(self.log,
"Sample download success";
"block_root" => %self.block_root,
"column_indexes" => ?resp_column_indexes,
"count" => resp_data_columns.len()
debug!(
block_root = %self.block_root,
column_indexes = ?resp_column_indexes,
count = resp_data_columns.len(),
"Sample download success"
);
metrics::inc_counter_vec(&metrics::SAMPLE_DOWNLOAD_RESULT, &[metrics::SUCCESS]);
@@ -300,10 +330,10 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
let mut data_columns = vec![];
for column_index in column_indexes {
let Some(request) = self.column_requests.get_mut(column_index) else {
warn!(self.log,
"Active column sample request not found";
"block_root" => %self.block_root,
"column_index" => column_index
warn!(
block_root = %self.block_root,
column_index,
"Active column sample request not found"
);
continue;
};
@@ -314,10 +344,10 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
else {
// Peer does not have the requested data, mark peer as "dont have" and try
// again with a different peer.
debug!(self.log,
"Sampling peer claims to not have the data";
"block_root" => %self.block_root,
"column_index" => column_index
debug!(
block_root = %self.block_root,
column_index,
"Sampling peer claims to not have the data"
);
request.on_sampling_error()?;
continue;
@@ -331,16 +361,16 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
.iter()
.map(|d| d.index)
.collect::<Vec<_>>();
debug!(self.log,
"Received data that was not requested";
"block_root" => %self.block_root,
"column_indexes" => ?resp_column_indexes
debug!(
block_root = %self.block_root,
column_indexes = ?resp_column_indexes,
"Received data that was not requested"
);
}
// Handle the downloaded data columns.
if data_columns.is_empty() {
debug!(self.log, "Received empty response"; "block_root" => %self.block_root);
debug!(block_root = %self.block_root, "Received empty response");
self.column_indexes_by_sampling_request
.remove(&sampling_request_id);
} else {
@@ -351,17 +381,17 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
// Peer has data column, send to verify
let Some(beacon_processor) = cx.beacon_processor_if_enabled() else {
// If processor is not available, error the entire sampling
debug!(self.log,
"Dropping sampling";
"block" => %self.block_root,
"reason" => "beacon processor unavailable"
debug!(
block = %self.block_root,
reason = "beacon processor unavailable",
"Dropping sampling"
);
return Err(SamplingError::ProcessorUnavailable);
};
debug!(self.log,
"Sending data_column for verification";
"block" => ?self.block_root,
"column_indexes" => ?column_indexes
debug!(
block = ?self.block_root,
?column_indexes,
"Sending data_column for verification"
);
if let Err(e) = beacon_processor.send_rpc_validate_data_columns(
self.block_root,
@@ -375,20 +405,21 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
// Beacon processor is overloaded, drop sampling attempt. Failing to sample
// is not a permanent state so we should recover once the node has capacity
// and receives a descendant block.
error!(self.log,
"Dropping sampling";
"block" => %self.block_root,
"reason" => e.to_string()
error!(
block = %self.block_root,
reason = e.to_string(),
"Dropping sampling"
);
return Err(SamplingError::SendFailed("beacon processor send failure"));
}
}
}
Err(err) => {
debug!(self.log, "Sample download error";
"block_root" => %self.block_root,
"column_indexes" => ?column_indexes,
"error" => ?err
debug!(
block_root = %self.block_root,
?column_indexes,
error = ?err,
"Sample download error"
);
metrics::inc_counter_vec(&metrics::SAMPLE_DOWNLOAD_RESULT, &[metrics::FAILURE]);
@@ -396,10 +427,10 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
// reaching this function. Mark the peer as failed and try again with another.
for column_index in column_indexes {
let Some(request) = self.column_requests.get_mut(column_index) else {
warn!(self.log,
"Active column sample request not found";
"block_root" => %self.block_root,
"column_index" => column_index
warn!(
block_root = %self.block_root,
column_index,
"Active column sample request not found"
);
continue;
};
@@ -429,21 +460,24 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
.column_indexes_by_sampling_request
.get(&sampling_request_id)
else {
error!(self.log, "Column indexes for the sampling request ID not found"; "sampling_request_id" => ?sampling_request_id);
error!(
?sampling_request_id,
"Column indexes for the sampling request ID not found"
);
return Ok(None);
};
match result {
Ok(_) => {
debug!(self.log, "Sample verification success"; "block_root" => %self.block_root, "column_indexes" => ?column_indexes);
debug!(block_root = %self.block_root,?column_indexes, "Sample verification success");
metrics::inc_counter_vec(&metrics::SAMPLE_VERIFY_RESULT, &[metrics::SUCCESS]);
// Valid, continue_sampling will maybe consider sampling a success
for column_index in column_indexes {
let Some(request) = self.column_requests.get_mut(column_index) else {
warn!(
self.log,
"Active column sample request not found"; "block_root" => %self.block_root, "column_index" => column_index
block_root = %self.block_root, column_index,
"Active column sample request not found"
);
continue;
};
@@ -451,7 +485,7 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
}
}
Err(err) => {
debug!(self.log, "Sample verification failure"; "block_root" => %self.block_root, "column_indexes" => ?column_indexes, "reason" => ?err);
debug!(block_root = %self.block_root, ?column_indexes, reason = ?err, "Sample verification failure");
metrics::inc_counter_vec(&metrics::SAMPLE_VERIFY_RESULT, &[metrics::FAILURE]);
// Peer sent invalid data, penalize and try again from different peer
@@ -459,8 +493,9 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
for column_index in column_indexes {
let Some(request) = self.column_requests.get_mut(column_index) else {
warn!(
self.log,
"Active column sample request not found"; "block_root" => %self.block_root, "column_index" => column_index
block_root = %self.block_root,
column_index,
"Active column sample request not found"
);
continue;
};
@@ -570,7 +605,7 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
// request was sent, loop to increase the required_successes until the sampling fails if
// there are no peers.
if ongoings == 0 && !sent_request {
debug!(self.log, "Sampling request stalled"; "block_root" => %self.block_root);
debug!(block_root = %self.block_root, "Sampling request stalled");
}
Ok(None)

View File

@@ -3,6 +3,7 @@ use lighthouse_network::rpc::methods::BlocksByRangeRequest;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::PeerId;
use std::collections::HashSet;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::ops::Sub;
use std::time::{Duration, Instant};
@@ -61,6 +62,7 @@ pub trait BatchConfig {
fn batch_attempt_hash<E: EthSpec>(blocks: &[RpcBlock<E>]) -> u64;
}
#[derive(Debug)]
pub struct RangeSyncBatchConfig {}
impl BatchConfig for RangeSyncBatchConfig {
@@ -93,6 +95,7 @@ pub enum BatchProcessingResult {
NonFaultyFailure,
}
#[derive(Debug)]
/// A segment of a chain.
pub struct BatchInfo<E: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
/// Start slot of the batch.
@@ -113,6 +116,17 @@ pub struct BatchInfo<E: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
marker: std::marker::PhantomData<B>,
}
/// One-line textual summary of a batch.
impl<E: EthSpec, B: BatchConfig> fmt::Display for BatchInfo<E, B> {
    /// Writes the batch's start slot, end slot (as stored) and current state.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            start_slot,
            end_slot,
            state,
            ..
        } = self;
        write!(
            f,
            "Start Slot: {start_slot}, End Slot: {end_slot}, State: {state}"
        )
    }
}
#[derive(Display)]
/// Current state of a batch
pub enum BatchState<E: EthSpec> {
/// The batch has failed either downloading or processing, but can be requested again.
@@ -190,15 +204,6 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
peers
}
/// Reports how often this batch has failed, as a
/// `(failed downloads, failed processings)` pair.
pub fn failed_attempts(&self) -> (usize, usize) {
    let download_failures = self.failed_download_attempts.len();
    let processing_failures = self.failed_processing_attempts.len();
    (download_failures, processing_failures)
}
/// Verifies if an incoming block belongs to this batch.
pub fn is_expecting_block(&self, request_id: &Id) -> bool {
if let BatchState::Downloading(_, expected_id) = &self.state {
@@ -456,39 +461,6 @@ impl Attempt {
}
}
/// Lets a `&mut BatchInfo` be passed directly as slog key-value data by forwarding to the
/// `slog::KV` implementation of the referenced `BatchInfo`.
impl<E: EthSpec, B: BatchConfig> slog::KV for &mut BatchInfo<E, B> {
fn serialize(
&self,
record: &slog::Record,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
// Dereference once to reach the `BatchInfo` impl instead of recursing into this one.
slog::KV::serialize(*self, record, serializer)
}
}
/// Serializes a batch as slog key-value pairs: `start_slot`, `end_slot`, `downloaded`,
/// `processed`, `processed_no_penalty`, `state` and `batch_ty`.
impl<E: EthSpec, B: BatchConfig> slog::KV for BatchInfo<E, B> {
fn serialize(
&self,
record: &slog::Record,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
use slog::Value;
Value::serialize(&self.start_slot, record, "start_slot", serializer)?;
Value::serialize(
&(self.end_slot - 1), // NOTE: The -1 shows inclusive blocks
record,
"end_slot",
serializer,
)?;
// "downloaded" and "processed" carry the number of *failed* download / processing attempts.
serializer.emit_usize("downloaded", self.failed_download_attempts.len())?;
serializer.emit_usize("processed", self.failed_processing_attempts.len())?;
serializer.emit_u8("processed_no_penalty", self.non_faulty_processing_attempts)?;
// The state is rendered through its `Debug` impl, the batch type through `Display`.
serializer.emit_arguments("state", &format_args!("{:?}", self.state))?;
serializer.emit_arguments("batch_ty", &format_args!("{}", self.batch_type))?;
slog::Result::Ok(())
}
}
impl<E: EthSpec> std::fmt::Debug for BatchState<E> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {

View File

@@ -9,11 +9,13 @@ use beacon_chain::BeaconChainTypes;
use fnv::FnvHashMap;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::{PeerAction, PeerId};
use logging::crit;
use rand::seq::SliceRandom;
use rand::Rng;
use slog::{crit, debug, o, warn};
use std::collections::{btree_map::Entry, BTreeMap, HashSet};
use std::fmt;
use strum::IntoStaticStr;
use tracing::{debug, instrument, warn};
use types::{Epoch, EthSpec, Hash256, Slot};
/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
@@ -37,6 +39,7 @@ pub type ProcessingResult = Result<KeepChain, RemoveChain>;
/// Reasons for removing a chain
#[derive(Debug)]
#[allow(dead_code)]
pub enum RemoveChain {
EmptyPeerPool,
ChainCompleted,
@@ -66,6 +69,7 @@ pub enum SyncingChainType {
/// A chain of blocks that need to be downloaded. Peers who claim to contain the target head
/// root are grouped into the peer pool and queried for batches when downloading the
/// chain.
#[derive(Debug)]
pub struct SyncingChain<T: BeaconChainTypes> {
/// A random id used to identify this chain.
id: ChainId,
@@ -110,9 +114,16 @@ pub struct SyncingChain<T: BeaconChainTypes> {
/// The current processing batch, if any.
current_processing_batch: Option<BatchId>,
}
/// The chain's log.
log: slog::Logger,
/// Displays a syncing chain as the name of its chain type.
impl<T: BeaconChainTypes> fmt::Display for SyncingChain<T> {
    /// Writes `Head`, `Finalized` or `Backfill` depending on `chain_type`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self.chain_type {
            SyncingChainType::Head => "Head",
            SyncingChainType::Finalized => "Finalized",
            SyncingChainType::Backfill => "Backfill",
        };
        f.write_str(label)
    }
}
#[derive(PartialEq, Debug)]
@@ -132,7 +143,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
target_head_root: Hash256,
peer_id: PeerId,
chain_type: SyncingChainType,
log: &slog::Logger,
) -> Self {
let mut peers = FnvHashMap::default();
peers.insert(peer_id, Default::default());
@@ -151,7 +161,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
attempted_optimistic_starts: HashSet::default(),
state: ChainSyncingState::Stopped,
current_processing_batch: None,
log: log.new(o!("chain" => id)),
}
}
@@ -161,21 +170,25 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Check if the chain has peers from which to process batches.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn available_peers(&self) -> usize {
self.peers.len()
}
/// Get the chain's id.
// Deliberately not `#[instrument]`ed: the accessor emits no events, so a span here would
// only add per-call overhead without attaching fields to any log line.
pub fn get_id(&self) -> ChainId {
    self.id
}
/// Peers currently syncing this chain.
///
/// Returns an iterator over clones of the ids in the chain's peer pool; the iterator
/// borrows `self`.
// Deliberately not `#[instrument]`ed: a span would only cover building the iterator (not
// its consumption) and no events are emitted here.
pub fn peers(&self) -> impl Iterator<Item = PeerId> + '_ {
    self.peers.keys().cloned()
}
/// Progress in epochs made by the chain
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn processed_epochs(&self) -> u64 {
self.processing_target
.saturating_sub(self.start_epoch)
@@ -183,6 +196,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Returns the total count of pending blocks in all the batches of this chain
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn pending_blocks(&self) -> usize {
self.batches
.values()
@@ -192,6 +206,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Removes a peer from the chain.
/// If the peer has active batches, those are considered failed and re-requested.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn remove_peer(
&mut self,
peer_id: &PeerId,
@@ -211,8 +226,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
self.retry_batch_download(network, id)?;
} else {
debug!(self.log, "Batch not found while removing peer";
"peer" => %peer_id, "batch" => id)
debug!(%peer_id, batch = ?id, "Batch not found while removing peer")
}
}
}
@@ -225,6 +239,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Returns the latest slot number that has been processed.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn current_processed_slot(&self) -> Slot {
// the last slot we processed was included in the previous batch, and corresponds to the
// first slot of the current target epoch
@@ -234,6 +249,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// A block has been received for a batch on this chain.
/// If the block correctly completes the batch it will be processed if possible.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn on_block_response(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -245,7 +261,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// check if we have this batch
let batch = match self.batches.get_mut(&batch_id) {
None => {
debug!(self.log, "Received a block for unknown batch"; "epoch" => batch_id);
debug!(epoch = %batch_id, "Received a block for unknown batch");
// A batch might get removed when the chain advances, so this is non fatal.
return Ok(KeepChain);
}
@@ -273,7 +289,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
let awaiting_batches = batch_id
.saturating_sub(self.optimistic_start.unwrap_or(self.processing_target))
/ EPOCHS_PER_BATCH;
debug!(self.log, "Batch downloaded"; "epoch" => batch_id, "blocks" => received, "batch_state" => self.visualize_batch_state(), "awaiting_batches" => awaiting_batches);
debug!(epoch = %batch_id, blocks = received, batch_state = self.visualize_batch_state(), %awaiting_batches,"Batch downloaded");
// pre-emptively request more blocks from peers whilst we process current blocks,
self.request_batches(network)?;
@@ -282,6 +298,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Processes the batch with the given id.
/// The batch must exist and be ready for processing
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn process_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -317,8 +334,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.current_processing_batch = Some(batch_id);
if let Err(e) = beacon_processor.send_chain_segment(process_id, blocks) {
crit!(self.log, "Failed to send chain segment to processor."; "msg" => "process_batch",
"error" => %e, "batch" => self.processing_target);
crit!(msg = "process_batch",error = %e, batch = ?self.processing_target, "Failed to send chain segment to processor.");
// This is unlikely to happen but it would stall syncing since the batch now has no
// blocks to continue, and the chain is expecting a processing result that won't
// arrive. To mitigate this, (fake) fail this processing so that the batch is
@@ -330,6 +346,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Processes the next ready batch, prioritizing optimistic batches over the processing target.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn process_completed_batches(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -349,7 +366,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
match state {
BatchState::AwaitingProcessing(..) => {
// this batch is ready
debug!(self.log, "Processing optimistic start"; "epoch" => epoch);
debug!(%epoch, "Processing optimistic start");
return self.process_batch(network, epoch);
}
BatchState::Downloading(..) => {
@@ -377,7 +394,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// batch has been requested and processed we can land here. We drop the
// optimistic candidate since we can't conclude whether the batch included
// blocks or not at this point
debug!(self.log, "Dropping optimistic candidate"; "batch" => epoch);
debug!(batch = %epoch, "Dropping optimistic candidate");
self.optimistic_start = None;
}
}
@@ -411,7 +428,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// inside the download buffer (between `self.processing_target` and
// `self.to_be_downloaded`). In this case, eventually the chain advances to the
// batch (`self.processing_target` reaches this point).
debug!(self.log, "Chain encountered a robust batch awaiting validation"; "batch" => self.processing_target);
debug!(
batch = %self.processing_target,
"Chain encountered a robust batch awaiting validation"
);
self.processing_target += EPOCHS_PER_BATCH;
if self.to_be_downloaded <= self.processing_target {
@@ -436,6 +456,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// The block processor has completed processing a batch. This function handles the result
/// of the batch processor.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn on_batch_process_result(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -447,13 +468,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
let batch_state = self.visualize_batch_state();
let batch = match &self.current_processing_batch {
Some(processing_id) if *processing_id != batch_id => {
debug!(self.log, "Unexpected batch result";
"batch_epoch" => batch_id, "expected_batch_epoch" => processing_id);
debug!(batch_epoch = %batch_id, expected_batch_epoch = %processing_id,"Unexpected batch result");
return Ok(KeepChain);
}
None => {
debug!(self.log, "Chain was not expecting a batch result";
"batch_epoch" => batch_id);
debug!(batch_epoch = %batch_id,"Chain was not expecting a batch result");
return Ok(KeepChain);
}
_ => {
@@ -476,8 +495,14 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
})?;
// Log the process result and the batch for debugging purposes.
debug!(self.log, "Batch processing result"; "result" => ?result, &batch,
"batch_epoch" => batch_id, "client" => %network.client_type(&peer), "batch_state" => batch_state);
debug!(
result = ?result,
batch_epoch = %batch_id,
client = %network.client_type(&peer),
batch_state = ?batch_state,
?batch,
"Batch processing result"
);
// We consider three cases. Batch was successfully processed, Batch failed processing due
// to a faulty peer, or batch failed processing but the peer can't be deemed faulty.
@@ -563,10 +588,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// There are some edge cases with forks that could land us in this situation.
// This should be unlikely, so we tolerate these errors, but not often.
warn!(
self.log,
"Batch failed to download. Dropping chain scoring peers";
"score_adjustment" => %penalty,
"batch_epoch"=> batch_id,
score_adjustment = %penalty,
batch_epoch = %batch_id,
"Batch failed to download. Dropping chain scoring peers"
);
for (peer, _) in self.peers.drain() {
@@ -587,6 +611,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
}
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn reject_optimistic_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -599,13 +624,13 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// it. NOTE: this is done to prevent non-sequential batches coming from optimistic
// starts from filling up the buffer size
if epoch < self.to_be_downloaded {
debug!(self.log, "Rejected optimistic batch left for future use"; "epoch" => %epoch, "reason" => reason);
debug!(%epoch, reason, "Rejected optimistic batch left for future use");
// this batch is now treated as any other batch, and re-requested for future use
if redownload {
return self.retry_batch_download(network, epoch);
}
} else {
debug!(self.log, "Rejected optimistic batch"; "epoch" => %epoch, "reason" => reason);
debug!(%epoch, reason, "Rejected optimistic batch");
self.batches.remove(&epoch);
}
}
@@ -621,6 +646,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// If a previous batch has been validated and it had been re-processed, penalize the original
/// peer.
#[allow(clippy::modulo_one)]
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn advance_chain(&mut self, network: &mut SyncNetworkContext<T>, validating_epoch: Epoch) {
// make sure this epoch produces an advancement
if validating_epoch <= self.start_epoch {
@@ -629,7 +655,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// safety check for batch boundaries
if validating_epoch % EPOCHS_PER_BATCH != self.start_epoch % EPOCHS_PER_BATCH {
crit!(self.log, "Validating Epoch is not aligned");
crit!("Validating Epoch is not aligned");
return;
}
@@ -651,9 +677,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// A different peer sent the correct batch, the previous peer did not
// We negatively score the original peer.
let action = PeerAction::LowToleranceError;
debug!(self.log, "Re-processed batch validated. Scoring original peer";
"batch_epoch" => id, "score_adjustment" => %action,
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
debug!(
batch_epoch = %id, score_adjustment = %action,
original_peer = %attempt.peer_id, new_peer = %processed_attempt.peer_id,
"Re-processed batch validated. Scoring original peer"
);
network.report_peer(
attempt.peer_id,
@@ -664,9 +691,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// The same peer corrected its previous mistake. There was an error, so we
// negative score the original peer.
let action = PeerAction::MidToleranceError;
debug!(self.log, "Re-processed batch validated by the same peer";
"batch_epoch" => id, "score_adjustment" => %action,
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
debug!(
batch_epoch = %id,
score_adjustment = %action,
original_peer = %attempt.peer_id,
new_peer = %processed_attempt.peer_id,
"Re-processed batch validated by the same peer"
);
network.report_peer(
attempt.peer_id,
@@ -683,13 +713,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
active_batches.remove(&id);
}
}
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => crit!(
self.log,
"batch indicates inconsistent chain state while advancing chain"
),
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => {
crit!("batch indicates inconsistent chain state while advancing chain")
}
BatchState::AwaitingProcessing(..) => {}
BatchState::Processing(_) => {
debug!(self.log, "Advancing chain while processing a batch"; "batch" => id, batch);
debug!(batch = %id, %batch, "Advancing chain while processing a batch");
if let Some(processing_id) = self.current_processing_batch {
if id <= processing_id {
self.current_processing_batch = None;
@@ -713,8 +742,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.optimistic_start = None;
}
}
debug!(self.log, "Chain advanced"; "previous_start" => old_start,
"new_start" => self.start_epoch, "processing_target" => self.processing_target);
debug!(
previous_start = %old_start,
new_start = %self.start_epoch,
processing_target = %self.processing_target,
"Chain advanced"
);
}
/// An invalid batch has been received that could not be processed, but that can be retried.
@@ -722,6 +755,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// These events occur when a peer has successfully responded with blocks, but the blocks we
/// have received are incorrect or invalid. This indicates the peer has not performed as
/// intended and can result in downvoting a peer.
// Span fields: `chain` carries this chain's id and `service` is the constant "range_sync",
// the same field layout used by the other `SyncingChain` spans, so events emitted from here
// can be filtered by service and correlated by chain id.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn handle_invalid_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -781,6 +815,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// This chain has been requested to start syncing.
///
/// This could be new chain, or an old chain that is being resumed.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn start_syncing(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -819,6 +854,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Add a peer to the chain.
///
/// If the chain is active, this starts requesting batches from this peer.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn add_peer(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -836,6 +872,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// An RPC error has occurred.
///
/// If the batch exists it is re-requested.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn inject_error(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -852,24 +889,21 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// columns.
if !batch.is_expecting_block(&request_id) {
debug!(
self.log,
"Batch not expecting block";
"batch_epoch" => batch_id,
"batch_state" => ?batch.state(),
"peer_id" => %peer_id,
"request_id" => %request_id,
"batch_state" => batch_state
batch_epoch = %batch_id,
batch_state = ?batch.state(),
%peer_id,
%request_id,
?batch_state,
"Batch not expecting block"
);
return Ok(KeepChain);
}
debug!(
self.log,
"Batch failed. RPC Error";
"batch_epoch" => batch_id,
"batch_state" => ?batch.state(),
"peer_id" => %peer_id,
"request_id" => %request_id,
"batch_state" => batch_state
batch_epoch = %batch_id,
batch_state = ?batch.state(),
%peer_id,
%request_id,
"Batch failed. RPC Error"
);
if let Some(active_requests) = self.peers.get_mut(peer_id) {
active_requests.remove(&batch_id);
@@ -883,12 +917,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.retry_batch_download(network, batch_id)
} else {
debug!(
self.log,
"Batch not found";
"batch_epoch" => batch_id,
"peer_id" => %peer_id,
"request_id" => %request_id,
"batch_state" => batch_state
batch_epoch = %batch_id,
%peer_id,
%request_id,
batch_state,
"Batch not found"
);
// this could be an error for an old batch, removed when the chain advances
Ok(KeepChain)
@@ -896,6 +929,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Sends and registers the request of a batch awaiting download.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn retry_batch_download(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -932,6 +966,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Requests the batch assigned to the given id from a given peer.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn send_batch(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -958,9 +993,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
.map(|epoch| epoch == batch_id)
.unwrap_or(false)
{
debug!(self.log, "Requesting optimistic batch"; "epoch" => batch_id, &batch, "batch_state" => batch_state);
debug!(epoch = %batch_id, %batch, %batch_state, "Requesting optimistic batch");
} else {
debug!(self.log, "Requesting batch"; "epoch" => batch_id, &batch, "batch_state" => batch_state);
debug!(epoch = %batch_id, %batch, %batch_state, "Requesting batch");
}
// register the batch for this peer
return self
@@ -979,8 +1014,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
Err(e) => {
// NOTE: under normal conditions this shouldn't happen but we handle it anyway
warn!(self.log, "Could not send batch request";
"batch_id" => batch_id, "error" => ?e, &batch);
warn!(%batch_id, error = %e, %batch, "Could not send batch request");
// register the failed download and check if the batch can be retried
batch.start_downloading_from_peer(peer, 1)?; // fake request_id is not relevant
self.peers
@@ -1005,6 +1039,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
/// Returns true if this chain is currently syncing.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn is_syncing(&self) -> bool {
match self.state {
ChainSyncingState::Syncing => true,
@@ -1014,6 +1049,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Kickstarts the chain by sending for processing batches that are ready and requesting more
/// batches if needed.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn resume(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -1026,6 +1062,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer
/// pool and left over batches until the batch buffer is reached or all peers are exhausted.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn request_batches(&mut self, network: &mut SyncNetworkContext<T>) -> ProcessingResult {
if !matches!(self.state, ChainSyncingState::Syncing) {
return Ok(KeepChain);
@@ -1052,10 +1089,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// We wait for this batch before requesting any other batches.
if let Some(epoch) = self.optimistic_start {
if !self.good_peers_on_sampling_subnets(epoch, network) {
debug!(
self.log,
"Waiting for peers to be available on sampling column subnets"
);
debug!("Waiting for peers to be available on sampling column subnets");
return Ok(KeepChain);
}
@@ -1114,6 +1148,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Creates the next required batch from the chain. If there are no more batches required,
/// `None` is returned.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn include_next_batch(&mut self, network: &mut SyncNetworkContext<T>) -> Option<BatchId> {
// don't request batches beyond the target head slot
if self
@@ -1147,10 +1182,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// block and data column requests are currently coupled. This can be removed once we find a
// way to decouple the requests and do retries individually, see issue #6258.
if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) {
debug!(
self.log,
"Waiting for peers to be available on custody column subnets"
);
debug!("Waiting for peers to be available on custody column subnets");
return None;
}
@@ -1177,6 +1209,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// This produces a string of the form: [D,E,E,E,E]
/// to indicate the current buffer state of the chain. The symbols are defined on each of the
/// batch states. See [BatchState::visualize] for symbol definitions.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
fn visualize_batch_state(&self) -> String {
let mut visualization_string = String::with_capacity((BATCH_BUFFER_SIZE * 3) as usize);
@@ -1212,45 +1245,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
}
/// `slog::KV` implementation for a mutable reference to a `SyncingChain`.
///
/// NOTE(review): presumably exists so that a `&mut SyncingChain<T>` can be handed directly
/// to slog logging macros as a key-value source — confirm against call sites.
impl<T: BeaconChainTypes> slog::KV for &mut SyncingChain<T> {
/// Forwards serialization to the `slog::KV` implementation reached by dereferencing
/// `self` once (expected to be the impl for `SyncingChain<T>` itself).
fn serialize(
&self,
record: &slog::Record,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
slog::KV::serialize(*self, record, serializer)
}
}
/// `slog::KV` implementation that exposes a summary of the chain as structured log fields.
///
/// Emitted keys, in order: `id`, `from`, `to`, `end_root`, `current_target`, `batches`,
/// `peers`, `state`.
impl<T: BeaconChainTypes> slog::KV for SyncingChain<T> {
/// Writes the chain's summary fields to `serializer`, returning early with the
/// serializer's error if any individual emit fails.
fn serialize(
&self,
record: &slog::Record,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
use slog::Value;
// Chain identifier.
serializer.emit_u32("id", self.id)?;
// Epoch range covered by the chain: `from` is the start epoch, `to` is the epoch
// containing the target head slot.
Value::serialize(&self.start_epoch, record, "from", serializer)?;
Value::serialize(
&self.target_head_slot.epoch(T::EthSpec::slots_per_epoch()),
record,
"to",
serializer,
)?;
// Target head root, rendered through its `Display` implementation.
serializer.emit_arguments("end_root", &format_args!("{}", self.target_head_root))?;
Value::serialize(
&self.processing_target,
record,
"current_target",
serializer,
)?;
// Sizes of the batch and peer collections.
serializer.emit_usize("batches", self.batches.len())?;
serializer.emit_usize("peers", self.peers.len())?;
// Syncing state, rendered through its `Debug` implementation.
serializer.emit_arguments("state", &format_args!("{:?}", self.state))?;
slog::Result::Ok(())
}
}
use super::batch::WrongState as WrongBatchState;
impl From<WrongBatchState> for RemoveChain {
fn from(err: WrongBatchState) -> Self {

View File

@@ -12,11 +12,12 @@ use fnv::FnvHashMap;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::PeerId;
use lighthouse_network::SyncInfo;
use slog::{crit, debug, error};
use logging::crit;
use smallvec::SmallVec;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;
use tracing::{debug, error};
use types::EthSpec;
use types::{Epoch, Hash256, Slot};
@@ -50,18 +51,15 @@ pub struct ChainCollection<T: BeaconChainTypes> {
head_chains: FnvHashMap<ChainId, SyncingChain<T>>,
/// The current sync state of the process.
state: RangeSyncState,
/// Logger for the collection.
log: slog::Logger,
}
impl<T: BeaconChainTypes> ChainCollection<T> {
pub fn new(beacon_chain: Arc<BeaconChain<T>>, log: slog::Logger) -> Self {
pub fn new(beacon_chain: Arc<BeaconChain<T>>) -> Self {
ChainCollection {
beacon_chain,
finalized_chains: FnvHashMap::default(),
head_chains: FnvHashMap::default(),
state: RangeSyncState::Idle,
log,
}
}
@@ -295,9 +293,8 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
.expect("Chain exists");
match old_id {
Some(Some(old_id)) => debug!(self.log, "Switching finalized chains";
"old_id" => old_id, &chain),
None => debug!(self.log, "Syncing new finalized chain"; &chain),
Some(Some(old_id)) => debug!(old_id, %chain, "Switching finalized chains"),
None => debug!(%chain, "Syncing new finalized chain"),
Some(None) => {
// this is the same chain. We try to advance it.
}
@@ -309,10 +306,10 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
if let Err(remove_reason) = chain.start_syncing(network, local_epoch, local_head_epoch)
{
if remove_reason.is_critical() {
crit!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
crit!(chain = new_id, reason = ?remove_reason, "Chain removed while switching chains");
} else {
// this happens only if sending a batch over the `network` fails a lot
error!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
error!(chain = new_id, reason = ?remove_reason, "Chain removed while switching chains");
}
self.finalized_chains.remove(&new_id);
self.on_chain_removed(&new_id, true, RangeSyncType::Finalized);
@@ -330,7 +327,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
) {
// Include the awaiting head peers
for (peer_id, peer_sync_info) in awaiting_head_peers.drain() {
debug!(self.log, "including head peer");
debug!("including head peer");
self.add_peer_or_create_chain(
local_epoch,
peer_sync_info.head_root,
@@ -362,16 +359,16 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
if syncing_chains.len() < PARALLEL_HEAD_CHAINS {
// start this chain if it's not already syncing
if !chain.is_syncing() {
debug!(self.log, "New head chain started syncing"; &chain);
debug!(%chain, "New head chain started syncing");
}
if let Err(remove_reason) =
chain.start_syncing(network, local_epoch, local_head_epoch)
{
self.head_chains.remove(&id);
if remove_reason.is_critical() {
crit!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
crit!(chain = id, reason = ?remove_reason, "Chain removed while switching head chains");
} else {
error!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
error!(chain = id, reason = ?remove_reason, "Chain removed while switching head chains");
}
} else {
syncing_chains.push(id);
@@ -407,7 +404,6 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
.start_slot(T::EthSpec::slots_per_epoch());
let beacon_chain = &self.beacon_chain;
let log_ref = &self.log;
let is_outdated = |target_slot: &Slot, target_root: &Hash256| {
target_slot <= &local_finalized_slot
@@ -425,7 +421,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
if is_outdated(&chain.target_head_slot, &chain.target_head_root)
|| chain.available_peers() == 0
{
debug!(log_ref, "Purging out of finalized chain"; &chain);
debug!(%chain, "Purging out of finalized chain");
Some((*id, chain.is_syncing(), RangeSyncType::Finalized))
} else {
None
@@ -436,7 +432,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
if is_outdated(&chain.target_head_slot, &chain.target_head_root)
|| chain.available_peers() == 0
{
debug!(log_ref, "Purging out of date head chain"; &chain);
debug!(%chain, "Purging out of date head chain");
Some((*id, chain.is_syncing(), RangeSyncType::Head))
} else {
None
@@ -477,14 +473,14 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
.find(|(_, chain)| chain.has_same_target(target_head_slot, target_head_root))
{
Some((&id, chain)) => {
debug!(self.log, "Adding peer to known chain"; "peer_id" => %peer, "sync_type" => ?sync_type, "id" => id);
debug!(peer_id = %peer, ?sync_type, id, "Adding peer to known chain");
debug_assert_eq!(chain.target_head_root, target_head_root);
debug_assert_eq!(chain.target_head_slot, target_head_slot);
if let Err(remove_reason) = chain.add_peer(network, peer) {
if remove_reason.is_critical() {
crit!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
crit!(chain = %id, reason = ?remove_reason, "Chain removed after adding peer");
} else {
error!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
error!(chain = %id, reason = ?remove_reason, "Chain removed after adding peer");
}
let is_syncing = chain.is_syncing();
collection.remove(&id);
@@ -501,9 +497,9 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
target_head_root,
peer,
sync_type.into(),
&self.log,
);
debug!(self.log, "New chain added to sync"; "peer_id" => peer_rpr, "sync_type" => ?sync_type, &new_chain);
debug!(peer_id = peer_rpr, ?sync_type, %new_chain, "New chain added to sync");
collection.insert(id, new_chain);
metrics::inc_counter_vec(&metrics::SYNCING_CHAINS_ADDED, &[sync_type.as_str()]);
self.update_metrics();

View File

@@ -51,10 +51,11 @@ use beacon_chain::{BeaconChain, BeaconChainTypes};
use lighthouse_network::rpc::GoodbyeReason;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::{PeerId, SyncInfo};
use logging::crit;
use lru_cache::LRUTimeCache;
use slog::{crit, debug, trace, warn};
use std::collections::HashMap;
use std::sync::Arc;
use tracing::{debug, instrument, trace, warn};
use types::{Epoch, EthSpec, Hash256};
/// For how long we store failed finalized chains to prevent retries.
@@ -74,23 +75,26 @@ pub struct RangeSync<T: BeaconChainTypes> {
chains: ChainCollection<T>,
/// Chains that have failed and are stored to prevent being retried.
failed_chains: LRUTimeCache<Hash256>,
/// The syncing logger.
log: slog::Logger,
}
impl<T: BeaconChainTypes> RangeSync<T>
where
T: BeaconChainTypes,
{
pub fn new(beacon_chain: Arc<BeaconChain<T>>, log: slog::Logger) -> Self {
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn new(beacon_chain: Arc<BeaconChain<T>>) -> Self {
RangeSync {
beacon_chain: beacon_chain.clone(),
chains: ChainCollection::new(beacon_chain, log.clone()),
chains: ChainCollection::new(beacon_chain),
failed_chains: LRUTimeCache::new(std::time::Duration::from_secs(
FAILED_CHAINS_EXPIRY_SECONDS,
)),
awaiting_head_peers: HashMap::new(),
log,
}
}
@@ -99,6 +103,12 @@ where
self.failed_chains.keys().copied().collect()
}
/// Returns the sync status reported by the underlying chain collection (`self.chains`).
#[instrument(
    parent = None,
    level = "info",
    fields(component = "range_sync"),
    name = "range_sync",
    skip_all
)]
pub fn state(&self) -> SyncChainStatus {
    self.chains.state()
}
@@ -108,6 +118,12 @@ where
/// may need to be synced as a result. A new peer, may increase the peer pool of a finalized
/// chain, this may result in a different finalized chain from syncing as finalized chains are
/// prioritised by peer-pool size.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn add_peer(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -133,14 +149,13 @@ where
RangeSyncType::Finalized => {
// Make sure we have not recently tried this chain
if self.failed_chains.contains(&remote_info.finalized_root) {
debug!(self.log, "Disconnecting peer that belongs to previously failed chain";
"failed_root" => %remote_info.finalized_root, "peer_id" => %peer_id);
debug!(failed_root = ?remote_info.finalized_root, %peer_id,"Disconnecting peer that belongs to previously failed chain");
network.goodbye_peer(peer_id, GoodbyeReason::IrrelevantNetwork);
return;
}
// Finalized chain search
debug!(self.log, "Finalization sync peer joined"; "peer_id" => %peer_id);
debug!(%peer_id, "Finalization sync peer joined");
self.awaiting_head_peers.remove(&peer_id);
// Because of our change in finalized sync batch size from 2 to 1 and our transition
@@ -171,8 +186,7 @@ where
if self.chains.is_finalizing_sync() {
// If there are finalized chains to sync, finish these first, before syncing head
// chains.
trace!(self.log, "Waiting for finalized sync to complete";
"peer_id" => %peer_id, "awaiting_head_peers" => &self.awaiting_head_peers.len());
trace!(%peer_id, awaiting_head_peers = &self.awaiting_head_peers.len(),"Waiting for finalized sync to complete");
self.awaiting_head_peers.insert(peer_id, remote_info);
return;
}
@@ -204,6 +218,12 @@ where
///
/// This function finds the chain that made this request. Once found, processes the result.
/// This request could complete a chain or simply add to its progress.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn blocks_by_range_response(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -229,11 +249,17 @@ where
}
}
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
trace!(%chain_id, "BlocksByRange response for removed chain")
}
}
}
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn handle_block_process_result(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -259,13 +285,19 @@ where
}
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
trace!(%chain_id, "BlocksByRange response for removed chain")
}
}
}
/// A peer has disconnected. This removes the peer from any ongoing chains and mappings. A
/// disconnected peer could remove a chain
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn peer_disconnect(&mut self, network: &mut SyncNetworkContext<T>, peer_id: &PeerId) {
// if the peer is in the awaiting head mapping, remove it
self.awaiting_head_peers.remove(peer_id);
@@ -278,6 +310,12 @@ where
/// which pool the peer is in. The chain may also have a batch or batches awaiting
/// for this peer. If so we mark the batch as failed. The batch may then hit its maximum
/// retries. In this case, we need to remove the chain.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
fn remove_peer(&mut self, network: &mut SyncNetworkContext<T>, peer_id: &PeerId) {
for (removed_chain, sync_type, remove_reason) in self
.chains
@@ -297,6 +335,12 @@ where
///
/// Check to see if the request corresponds to a pending batch. If so, re-request it if possible, if there have
/// been too many failed attempts for the batch, remove the chain.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn inject_error(
&mut self,
network: &mut SyncNetworkContext<T>,
@@ -321,11 +365,17 @@ where
}
}
Err(_) => {
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
trace!(%chain_id, "BlocksByRange response for removed chain")
}
}
}
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
fn on_chain_removed(
&mut self,
chain: SyncingChain<T>,
@@ -335,14 +385,18 @@ where
op: &'static str,
) {
if remove_reason.is_critical() {
crit!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
crit!(?sync_type, %chain, reason = ?remove_reason,op, "Chain removed");
} else {
debug!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
debug!(?sync_type, %chain, reason = ?remove_reason,op, "Chain removed");
}
if let RemoveChain::ChainFailed { blacklist, .. } = remove_reason {
if RangeSyncType::Finalized == sync_type && blacklist {
warn!(self.log, "Chain failed! Syncing to its head won't be retried for at least the next {} seconds", FAILED_CHAINS_EXPIRY_SECONDS; &chain);
warn!(
%chain,
"Chain failed! Syncing to its head won't be retried for at least the next {} seconds",
FAILED_CHAINS_EXPIRY_SECONDS
);
self.failed_chains.insert(chain.target_head_root);
}
}
@@ -369,6 +423,12 @@ where
}
/// Kickstarts sync.
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
name = "range_sync",
skip_all
)]
pub fn resume(&mut self, network: &mut SyncNetworkContext<T>) {
for (removed_chain, sync_type, remove_reason) in
self.chains.call_all(|chain| chain.resume(network))

View File

@@ -19,8 +19,8 @@ use beacon_chain::{
block_verification_types::{AsBlock, BlockImportData},
data_availability_checker::Availability,
test_utils::{
build_log, generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec,
BeaconChainHarness, EphemeralHarnessType, LoggerType, NumBlobs,
generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec,
BeaconChainHarness, EphemeralHarnessType, NumBlobs,
},
validator_monitor::timestamp_now,
AvailabilityPendingExecutedBlock, AvailabilityProcessingStatus, BlockError,
@@ -37,9 +37,9 @@ use lighthouse_network::{
types::SyncState,
NetworkConfig, NetworkGlobals, PeerId,
};
use slog::info;
use slot_clock::{SlotClock, TestingSlotClock};
use tokio::sync::mpsc;
use tracing::info;
use types::{
data_column_sidecar::ColumnIndex,
test_utils::{SeedableRng, TestRandom, XorShiftRng},
@@ -55,22 +55,12 @@ type DCByRootId = (SyncRequestId, Vec<ColumnIndex>);
impl TestRig {
pub fn test_setup() -> Self {
let logger_type = if cfg!(feature = "test_logger") {
LoggerType::Test
} else if cfg!(feature = "ci_logger") {
LoggerType::CI
} else {
LoggerType::Null
};
let log = build_log(slog::Level::Trace, logger_type);
// Use `fork_from_env` logic to set correct fork epochs
let spec = test_spec::<E>();
// Initialise a new beacon chain
let harness = BeaconChainHarness::<EphemeralHarnessType<E>>::builder(E)
.spec(Arc::new(spec))
.logger(log.clone())
.deterministic_keypairs(1)
.fresh_ephemeral_store()
.mock_execution_layer()
@@ -95,7 +85,6 @@ impl TestRig {
let network_config = Arc::new(NetworkConfig::default());
let globals = Arc::new(NetworkGlobals::new_test_globals(
Vec::new(),
&log,
network_config,
chain.spec.clone(),
));
@@ -104,7 +93,6 @@ impl TestRig {
sync_tx,
chain.clone(),
harness.runtime.task_executor.clone(),
log.clone(),
);
let fork_name = chain.spec.fork_name_at_slot::<E>(chain.slot().unwrap());
@@ -137,11 +125,9 @@ impl TestRig {
required_successes: vec![SAMPLING_REQUIRED_SUCCESSES],
},
fork_context,
log.clone(),
),
harness,
fork_name,
log,
spec,
}
}
@@ -165,7 +151,7 @@ impl TestRig {
}
/// Emits an `info`-level log event labelled "TEST_RIG" that carries `msg`.
///
/// `msg` is attached to the event as a field named `msg` rather than being
/// interpolated into the "TEST_RIG" message text.
pub fn log(&self, msg: &str) {
// slog form: the logger handle `self.log` is passed explicitly and `msg` is a key-value pair.
info!(self.log, "TEST_RIG"; "msg" => msg);
// tracing form: no logger argument; `msg` is recorded as a field ahead of the message literal.
info!(msg, "TEST_RIG");
}
pub fn after_deneb(&self) -> bool {
@@ -2318,11 +2304,6 @@ mod deneb_only {
})
}
/// Forwards `msg` to the wrapped rig's `log` method, then hands `self` back to the caller.
///
/// Takes `self` by value and returns it unchanged, so the call can sit in the middle of a
/// chain of by-value method calls.
fn log(self, msg: &str) -> Self {
self.rig.log(msg);
self
}
fn trigger_unknown_block_from_attestation(mut self) -> Self {
let block_root = self.block.canonical_root();
self.rig
@@ -2626,6 +2607,11 @@ mod deneb_only {
.block_imported()
}
/// Forwards `msg` to the wrapped rig's `log` method, then hands `self` back to the caller.
///
/// Takes `self` by value and returns it unchanged, so the call can sit in the middle of a
/// chain of by-value method calls.
fn log(self, msg: &str) -> Self {
self.rig.log(msg);
self
}
fn parent_block_then_empty_parent_blobs(self) -> Self {
self.log(
" Return empty blobs for parent, block errors with missing components, downscore",

View File

@@ -8,7 +8,6 @@ use beacon_chain::test_utils::{BeaconChainHarness, EphemeralHarnessType};
use beacon_processor::WorkEvent;
use lighthouse_network::NetworkGlobals;
use rand_chacha::ChaCha20Rng;
use slog::Logger;
use slot_clock::ManualSlotClock;
use std::sync::Arc;
use store::MemoryStore;
@@ -64,6 +63,5 @@ struct TestRig {
/// `rng` for generating test blocks and blobs.
rng: ChaCha20Rng,
fork_name: ForkName,
log: Logger,
spec: Arc<ChainSpec>,
}