mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-07 02:31:45 +00:00
Integrate tracing (#6339)
Tracing Integration
- [reference](5bbf1859e9/projects/project-ideas.md (L297))
- [x] replace slog & log with tracing throughout the codebase
- [x] implement custom crit log
- [x] make relevant changes in the formatter
- [x] replace sloggers
- [x] re-write SSE logging components
cc: @macladson @eserilev
This commit is contained in:
@@ -20,13 +20,14 @@ use beacon_chain::{BeaconChain, BeaconChainTypes};
|
||||
use lighthouse_network::service::api_types::Id;
|
||||
use lighthouse_network::types::{BackFillState, NetworkGlobals};
|
||||
use lighthouse_network::{PeerAction, PeerId};
|
||||
use logging::crit;
|
||||
use rand::seq::SliceRandom;
|
||||
use slog::{crit, debug, error, info, warn};
|
||||
use std::collections::{
|
||||
btree_map::{BTreeMap, Entry},
|
||||
HashMap, HashSet,
|
||||
};
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use types::{Epoch, EthSpec};
|
||||
|
||||
/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
|
||||
@@ -146,16 +147,17 @@ pub struct BackFillSync<T: BeaconChainTypes> {
|
||||
/// Reference to the network globals in order to obtain valid peers to backfill blocks from
|
||||
/// (i.e. synced peers).
|
||||
network_globals: Arc<NetworkGlobals<T::EthSpec>>,
|
||||
|
||||
/// A logger for backfill sync.
|
||||
log: slog::Logger,
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn new(
|
||||
beacon_chain: Arc<BeaconChain<T>>,
|
||||
network_globals: Arc<NetworkGlobals<T::EthSpec>>,
|
||||
log: slog::Logger,
|
||||
) -> Self {
|
||||
// Determine if backfill is enabled or not.
|
||||
// If, for some reason a backfill has already been completed (or we've used a trusted
|
||||
@@ -186,7 +188,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
participating_peers: HashSet::new(),
|
||||
restart_failed_sync: false,
|
||||
beacon_chain,
|
||||
log,
|
||||
};
|
||||
|
||||
// Update the global network state with the current backfill state.
|
||||
@@ -195,9 +196,15 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Pauses the backfill sync if it's currently syncing.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn pause(&mut self) {
|
||||
if let BackFillState::Syncing = self.state() {
|
||||
debug!(self.log, "Backfill sync paused"; "processed_epochs" => self.validated_batches, "to_be_processed" => self.current_start);
|
||||
debug!(processed_epochs = %self.validated_batches, to_be_processed = %self.current_start,"Backfill sync paused");
|
||||
self.set_state(BackFillState::Paused);
|
||||
}
|
||||
}
|
||||
@@ -206,6 +213,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
///
|
||||
/// If resuming is successful, reports back the current syncing metrics.
|
||||
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn start(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -222,7 +235,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
.is_some()
|
||||
{
|
||||
// If there are peers to resume with, begin the resume.
|
||||
debug!(self.log, "Resuming backfill sync"; "start_epoch" => self.current_start, "awaiting_batches" => self.batches.len(), "processing_target" => self.processing_target);
|
||||
debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync");
|
||||
self.set_state(BackFillState::Syncing);
|
||||
// Resume any previously failed batches.
|
||||
self.resume_batches(network)?;
|
||||
@@ -251,14 +264,14 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// This infallible match exists to force us to update this code if a future
|
||||
// refactor of `ResetEpochError` adds a variant.
|
||||
let ResetEpochError::SyncCompleted = e;
|
||||
error!(self.log, "Backfill sync completed whilst in failed status");
|
||||
error!("Backfill sync completed whilst in failed status");
|
||||
self.set_state(BackFillState::Completed);
|
||||
return Err(BackFillError::InvalidSyncState(String::from(
|
||||
"chain completed",
|
||||
)));
|
||||
}
|
||||
|
||||
debug!(self.log, "Resuming a failed backfill sync"; "start_epoch" => self.current_start);
|
||||
debug!(start_epoch = %self.current_start, "Resuming a failed backfill sync");
|
||||
|
||||
// begin requesting blocks from the peer pool, until all peers are exhausted.
|
||||
self.request_batches(network)?;
|
||||
@@ -281,6 +294,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// A fully synced peer has joined us.
|
||||
/// If we are in a failed state, update a local variable to indicate we are able to restart
|
||||
/// the failed sync on the next attempt.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn fully_synced_peer_joined(&mut self) {
|
||||
if matches!(self.state(), BackFillState::Failed) {
|
||||
self.restart_failed_sync = true;
|
||||
@@ -289,6 +308,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
|
||||
/// A peer has disconnected.
|
||||
/// If the peer has active batches, those are considered failed and re-requested.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||
pub fn peer_disconnected(
|
||||
&mut self,
|
||||
@@ -318,15 +343,13 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// short circuit early.
|
||||
if self.retry_batch_download(network, id).is_err() {
|
||||
debug!(
|
||||
self.log,
|
||||
"Batch could not be retried";
|
||||
"batch_id" => id,
|
||||
"error" => "no synced peers"
|
||||
batch_id = %id,
|
||||
error = "no synced peers",
|
||||
"Batch could not be retried"
|
||||
);
|
||||
}
|
||||
} else {
|
||||
debug!(self.log, "Batch not found while removing peer";
|
||||
"peer" => %peer_id, "batch" => id)
|
||||
debug!(peer = %peer_id, batch = %id, "Batch not found while removing peer");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -339,6 +362,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// An RPC error has occurred.
|
||||
///
|
||||
/// If the batch exists it is re-requested.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||
pub fn inject_error(
|
||||
&mut self,
|
||||
@@ -356,7 +385,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
if !batch.is_expecting_block(&request_id) {
|
||||
return Ok(());
|
||||
}
|
||||
debug!(self.log, "Batch failed"; "batch_epoch" => batch_id, "error" => "rpc_error");
|
||||
debug!(batch_epoch = %batch_id, error = "rpc_error", "Batch failed");
|
||||
if let Some(active_requests) = self.active_requests.get_mut(peer_id) {
|
||||
active_requests.remove(&batch_id);
|
||||
}
|
||||
@@ -378,6 +407,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// If this returns an error, the backfill sync has failed and will be restarted once new peers
|
||||
/// join the system.
|
||||
/// The sync manager should update the global sync state on failure.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||
pub fn on_block_response(
|
||||
&mut self,
|
||||
@@ -391,7 +426,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
let Some(batch) = self.batches.get_mut(&batch_id) else {
|
||||
if !matches!(self.state(), BackFillState::Failed) {
|
||||
// A batch might get removed when the chain advances, so this is non fatal.
|
||||
debug!(self.log, "Received a block for unknown batch"; "epoch" => batch_id);
|
||||
debug!(epoch = %batch_id, "Received a block for unknown batch");
|
||||
}
|
||||
return Ok(ProcessResult::Successful);
|
||||
};
|
||||
@@ -416,7 +451,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
Ok(received) => {
|
||||
let awaiting_batches =
|
||||
self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH;
|
||||
debug!(self.log, "Completed batch received"; "epoch" => batch_id, "blocks" => received, "awaiting_batches" => awaiting_batches);
|
||||
debug!(
|
||||
epoch = %batch_id,
|
||||
blocks = received,
|
||||
%awaiting_batches,
|
||||
"Completed batch received"
|
||||
);
|
||||
|
||||
// pre-emptively request more blocks from peers whilst we process current blocks,
|
||||
self.request_batches(network)?;
|
||||
@@ -432,6 +472,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// The syncing process has failed.
|
||||
///
|
||||
/// This resets past variables, to allow for a fresh start when resuming.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn fail_sync(&mut self, error: BackFillError) -> Result<(), BackFillError> {
|
||||
// Some errors shouldn't fail the chain.
|
||||
if matches!(error, BackFillError::Paused) {
|
||||
@@ -455,7 +501,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// NOTE: Let's keep validated_batches for posterity
|
||||
|
||||
// Emit the log here
|
||||
error!(self.log, "Backfill sync failed"; "error" => ?error);
|
||||
error!(?error, "Backfill sync failed");
|
||||
|
||||
// Return the error, kinda weird pattern, but I want to use
|
||||
// `self.fail_sync(_)?` in other parts of the code.
|
||||
@@ -464,6 +510,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
|
||||
/// Processes the batch with the given id.
|
||||
/// The batch must exist and be ready for processing
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn process_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -503,8 +555,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
.beacon_processor()
|
||||
.send_chain_segment(process_id, blocks)
|
||||
{
|
||||
crit!(self.log, "Failed to send backfill segment to processor."; "msg" => "process_batch",
|
||||
"error" => %e, "batch" => self.processing_target);
|
||||
crit!(
|
||||
msg = "process_batch",
|
||||
error = %e,
|
||||
batch = ?self.processing_target,
|
||||
"Failed to send backfill segment to processor."
|
||||
);
|
||||
// This is unlikely to happen but it would stall syncing since the batch now has no
|
||||
// blocks to continue, and the chain is expecting a processing result that won't
|
||||
// arrive. To mitigate this, (fake) fail this processing so that the batch is
|
||||
@@ -518,6 +574,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// The block processor has completed processing a batch. This function handles the result
|
||||
/// of the batch processor.
|
||||
/// If an error is returned the BackFill sync has failed.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
|
||||
pub fn on_batch_process_result(
|
||||
&mut self,
|
||||
@@ -530,13 +592,15 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// result
|
||||
let batch = match &self.current_processing_batch {
|
||||
Some(processing_id) if *processing_id != batch_id => {
|
||||
debug!(self.log, "Unexpected batch result";
|
||||
"batch_epoch" => batch_id, "expected_batch_epoch" => processing_id);
|
||||
debug!(
|
||||
batch_epoch = %batch_id.as_u64(),
|
||||
expected_batch_epoch = processing_id.as_u64(),
|
||||
"Unexpected batch result"
|
||||
);
|
||||
return Ok(ProcessResult::Successful);
|
||||
}
|
||||
None => {
|
||||
debug!(self.log, "Chain was not expecting a batch result";
|
||||
"batch_epoch" => batch_id);
|
||||
debug!(%batch_id, "Chain was not expecting a batch result");
|
||||
return Ok(ProcessResult::Successful);
|
||||
}
|
||||
_ => {
|
||||
@@ -566,8 +630,14 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
return Ok(ProcessResult::Successful);
|
||||
};
|
||||
|
||||
debug!(self.log, "Backfill batch processed"; "result" => ?result, &batch,
|
||||
"batch_epoch" => batch_id, "peer" => %peer, "client" => %network.client_type(peer));
|
||||
debug!(
|
||||
?result,
|
||||
%batch,
|
||||
batch_epoch = %batch_id,
|
||||
%peer,
|
||||
client = %network.client_type(peer),
|
||||
"Backfill batch processed"
|
||||
);
|
||||
|
||||
match result {
|
||||
BatchProcessResult::Success {
|
||||
@@ -591,7 +661,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// check if the chain has completed syncing
|
||||
if self.check_completed() {
|
||||
// chain is completed
|
||||
info!(self.log, "Backfill sync completed"; "blocks_processed" => self.validated_batches * T::EthSpec::slots_per_epoch());
|
||||
info!(
|
||||
blocks_processed = self.validated_batches * T::EthSpec::slots_per_epoch(),
|
||||
"Backfill sync completed"
|
||||
);
|
||||
self.set_state(BackFillState::Completed);
|
||||
Ok(ProcessResult::SyncCompleted)
|
||||
} else {
|
||||
@@ -619,10 +692,9 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// repeatedly and are either malicious or faulty. We stop the backfill sync and
|
||||
// report all synced peers that have participated.
|
||||
warn!(
|
||||
self.log,
|
||||
"Backfill batch failed to download. Penalizing peers";
|
||||
"score_adjustment" => %penalty,
|
||||
"batch_epoch"=> batch_id
|
||||
score_adjustment = %penalty,
|
||||
batch_epoch = %batch_id,
|
||||
"Backfill batch failed to download. Penalizing peers"
|
||||
);
|
||||
|
||||
for peer in self.participating_peers.drain() {
|
||||
@@ -658,6 +730,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Processes the next ready batch.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn process_completed_batches(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -692,7 +770,10 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
BatchState::AwaitingValidation(_) => {
|
||||
// TODO: I don't think this state is possible, log a CRIT just in case.
|
||||
// If this is not observed, add it to the failed state branch above.
|
||||
crit!(self.log, "Chain encountered a robust batch awaiting validation"; "batch" => self.processing_target);
|
||||
crit!(
|
||||
batch = ?self.processing_target,
|
||||
"Chain encountered a robust batch awaiting validation"
|
||||
);
|
||||
|
||||
self.processing_target -= BACKFILL_EPOCHS_PER_BATCH;
|
||||
if self.to_be_downloaded >= self.processing_target {
|
||||
@@ -718,6 +799,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
///
|
||||
/// If a previous batch has been validated and it had been re-processed, penalize the original
|
||||
/// peer.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn advance_chain(&mut self, network: &mut SyncNetworkContext<T>, validating_epoch: Epoch) {
|
||||
// make sure this epoch produces an advancement
|
||||
if validating_epoch >= self.current_start {
|
||||
@@ -745,9 +832,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// A different peer sent the correct batch, the previous peer did not
|
||||
// We negatively score the original peer.
|
||||
let action = PeerAction::LowToleranceError;
|
||||
debug!(self.log, "Re-processed batch validated. Scoring original peer";
|
||||
"batch_epoch" => id, "score_adjustment" => %action,
|
||||
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
|
||||
debug!(
|
||||
batch_epoch = ?id,
|
||||
score_adjustment = %action,
|
||||
original_peer = %attempt.peer_id,
|
||||
new_peer = %processed_attempt.peer_id,
|
||||
"Re-processed batch validated. Scoring original peer"
|
||||
);
|
||||
network.report_peer(
|
||||
attempt.peer_id,
|
||||
@@ -758,9 +848,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// The same peer corrected its previous mistake. There was an error, so we
|
||||
// negatively score the original peer.
|
||||
let action = PeerAction::MidToleranceError;
|
||||
debug!(self.log, "Re-processed batch validated by the same peer";
|
||||
"batch_epoch" => id, "score_adjustment" => %action,
|
||||
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
|
||||
debug!(
|
||||
batch_epoch = ?id,
|
||||
score_adjustment = %action,
|
||||
original_peer = %attempt.peer_id,
|
||||
new_peer = %processed_attempt.peer_id,
|
||||
"Re-processed batch validated by the same peer"
|
||||
);
|
||||
network.report_peer(
|
||||
attempt.peer_id,
|
||||
@@ -778,14 +871,11 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
}
|
||||
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => {
|
||||
crit!(
|
||||
self.log,
|
||||
"batch indicates inconsistent chain state while advancing chain"
|
||||
)
|
||||
crit!("batch indicates inconsistent chain state while advancing chain")
|
||||
}
|
||||
BatchState::AwaitingProcessing(..) => {}
|
||||
BatchState::Processing(_) => {
|
||||
debug!(self.log, "Advancing chain while processing a batch"; "batch" => id, batch);
|
||||
debug!(batch = %id, %batch, "Advancing chain while processing a batch");
|
||||
if let Some(processing_id) = self.current_processing_batch {
|
||||
if id >= processing_id {
|
||||
self.current_processing_batch = None;
|
||||
@@ -803,7 +893,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
// won't have this batch, so we need to request it.
|
||||
self.to_be_downloaded -= BACKFILL_EPOCHS_PER_BATCH;
|
||||
}
|
||||
debug!(self.log, "Backfill advanced"; "validated_epoch" => validating_epoch, "processing_target" => self.processing_target);
|
||||
debug!(?validating_epoch, processing_target = ?self.processing_target, "Backfill advanced");
|
||||
}
|
||||
|
||||
/// An invalid batch has been received that could not be processed, but that can be retried.
|
||||
@@ -811,6 +901,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
/// These events occur when a peer has successfully responded with blocks, but the blocks we
|
||||
/// have received are incorrect or invalid. This indicates the peer has not performed as
|
||||
/// intended and can result in downvoting a peer.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn handle_invalid_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -862,6 +958,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Sends and registers the request of a batch awaiting download.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn retry_batch_download(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -896,13 +998,19 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
self.send_batch(network, batch_id, peer)
|
||||
} else {
|
||||
// If we are here the chain has no more synced peers
|
||||
info!(self.log, "Backfill sync paused"; "reason" => "insufficient_synced_peers");
|
||||
info!(reason = "insufficient_synced_peers", "Backfill sync paused");
|
||||
self.set_state(BackFillState::Paused);
|
||||
Err(BackFillError::Paused)
|
||||
}
|
||||
}
|
||||
|
||||
/// Requests the batch assigned to the given id from a given peer.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn send_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -922,7 +1030,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
if let Err(e) = batch.start_downloading_from_peer(peer, request_id) {
|
||||
return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0));
|
||||
}
|
||||
debug!(self.log, "Requesting batch"; "epoch" => batch_id, &batch);
|
||||
debug!(epoch = %batch_id, %batch, "Requesting batch");
|
||||
|
||||
// register the batch for this peer
|
||||
self.active_requests
|
||||
@@ -933,8 +1041,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
Err(e) => {
|
||||
// NOTE: under normal conditions this shouldn't happen but we handle it anyway
|
||||
warn!(self.log, "Could not send batch request";
|
||||
"batch_id" => batch_id, "error" => ?e, &batch);
|
||||
warn!(%batch_id, error = ?e, %batch,"Could not send batch request");
|
||||
// register the failed download and check if the batch can be retried
|
||||
if let Err(e) = batch.start_downloading_from_peer(peer, 1) {
|
||||
return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0));
|
||||
@@ -963,6 +1070,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
|
||||
/// When resuming a chain, this function searches for batches that need to be re-downloaded and
|
||||
/// transitions their state to redownload the batch.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn resume_batches(&mut self, network: &mut SyncNetworkContext<T>) -> Result<(), BackFillError> {
|
||||
let batch_ids_to_retry = self
|
||||
.batches
|
||||
@@ -987,6 +1100,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
|
||||
/// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer
|
||||
/// pool and left over batches until the batch buffer is reached or all peers are exhausted.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn request_batches(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -1029,6 +1148,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
|
||||
/// Creates the next required batch from the chain. If there are no more batches required,
|
||||
/// `false` is returned.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn include_next_batch(&mut self, network: &mut SyncNetworkContext<T>) -> Option<BatchId> {
|
||||
// don't request batches beyond genesis;
|
||||
if self.last_batch_downloaded {
|
||||
@@ -1090,6 +1215,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
///
|
||||
/// This errors if the beacon chain indicates that backfill sync has already completed or is
|
||||
/// not required.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn reset_start_epoch(&mut self) -> Result<(), ResetEpochError> {
|
||||
let anchor_info = self.beacon_chain.store.get_anchor_info();
|
||||
if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) {
|
||||
@@ -1103,6 +1234,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Checks with the beacon chain if backfill sync has completed.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn check_completed(&mut self) -> bool {
|
||||
if self.would_complete(self.current_start) {
|
||||
// Check that the beacon chain agrees
|
||||
@@ -1111,13 +1248,19 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
if anchor_info.block_backfill_complete(self.beacon_chain.genesis_backfill_slot) {
|
||||
return true;
|
||||
} else {
|
||||
error!(self.log, "Backfill out of sync with beacon chain");
|
||||
error!("Backfill out of sync with beacon chain");
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Checks if backfill would complete by syncing to `start_epoch`.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn would_complete(&self, start_epoch: Epoch) -> bool {
|
||||
start_epoch
|
||||
<= self
|
||||
@@ -1127,10 +1270,22 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
||||
}
|
||||
|
||||
/// Updates the global network state indicating the current state of a backfill sync.
/// Overwrites the value behind `network_globals.backfill_state` with `state`.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn set_state(&self, state: BackFillState) {
|
||||
*self.network_globals.backfill_state.write() = state;
|
||||
}
|
||||
|
||||
/// Returns a clone of the backfill state currently stored in `network_globals`.
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "backfill_sync"),
|
||||
name = "backfill_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn state(&self) -> BackFillState {
|
||||
self.network_globals.backfill_state.read().clone()
|
||||
}
|
||||
|
||||
@@ -41,11 +41,11 @@ use lighthouse_network::service::api_types::SingleLookupReqId;
|
||||
use lighthouse_network::{PeerAction, PeerId};
|
||||
use lru_cache::LRUTimeCache;
|
||||
pub use single_block_lookup::{BlobRequestState, BlockRequestState, CustodyRequestState};
|
||||
use slog::{debug, error, warn, Logger};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use store::Hash256;
|
||||
use tracing::{debug, error, instrument, warn};
|
||||
use types::{BlobSidecar, DataColumnSidecar, EthSpec, SignedBeaconBlock};
|
||||
|
||||
pub mod common;
|
||||
@@ -116,9 +116,6 @@ pub struct BlockLookups<T: BeaconChainTypes> {
|
||||
|
||||
// TODO: Why not index lookups by block_root?
|
||||
single_block_lookups: FnvHashMap<SingleLookupId, SingleBlockLookup<T>>,
|
||||
|
||||
/// The logger for the import manager.
|
||||
log: Logger,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -130,27 +127,45 @@ use lighthouse_network::service::api_types::Id;
|
||||
pub(crate) type BlockLookupSummary = (Id, Hash256, Option<Hash256>, Vec<PeerId>);
|
||||
|
||||
impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
pub fn new(log: Logger) -> Self {
|
||||
#[instrument(parent = None,level = "info", fields(service = "lookup_sync"), name = "lookup_sync")]
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
failed_chains: LRUTimeCache::new(Duration::from_secs(
|
||||
FAILED_CHAINS_CACHE_EXPIRY_SECONDS,
|
||||
)),
|
||||
single_block_lookups: Default::default(),
|
||||
log,
|
||||
}
|
||||
}
|
||||
|
||||
/// Test-only helper: inserts `block_root` into the `failed_chains` cache.
#[cfg(test)]
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) {
|
||||
self.failed_chains.insert(block_root);
|
||||
}
|
||||
|
||||
/// Test-only helper: returns the keys of the `failed_chains` cache, cloned into a `Vec`.
#[cfg(test)]
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub(crate) fn get_failed_chains(&mut self) -> Vec<Hash256> {
|
||||
self.failed_chains.keys().cloned().collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub(crate) fn active_single_lookups(&self) -> Vec<BlockLookupSummary> {
|
||||
self.single_block_lookups
|
||||
.iter()
|
||||
@@ -159,6 +174,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
}
|
||||
|
||||
/// Returns a vec of all parent lookup chains by tip, in descending slot order (tip first)
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub(crate) fn active_parent_lookups(&self) -> Vec<NodeChain> {
|
||||
compute_parent_chains(
|
||||
&self
|
||||
@@ -173,6 +194,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
|
||||
/// Creates a parent lookup for the block with the given `block_root` and immediately triggers it.
|
||||
/// If a parent lookup exists or is triggered, a current lookup will be created.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn search_child_and_parent(
|
||||
&mut self,
|
||||
block_root: Hash256,
|
||||
@@ -202,6 +229,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
|
||||
/// Search for a block whose parent root is unknown.
|
||||
/// Returns true if the lookup is created or already exists
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn search_unknown_block(
|
||||
&mut self,
|
||||
block_root: Hash256,
|
||||
@@ -217,6 +250,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
/// - `block_root_to_search` is a failed chain
|
||||
///
|
||||
/// Returns true if the lookup is created or already exists
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn search_parent_of_child(
|
||||
&mut self,
|
||||
block_root_to_search: Hash256,
|
||||
@@ -238,7 +277,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
if (block_would_extend_chain || trigger_is_chain_tip)
|
||||
&& parent_chain.len() >= PARENT_DEPTH_TOLERANCE
|
||||
{
|
||||
debug!(self.log, "Parent lookup chain too long"; "block_root" => ?block_root_to_search);
|
||||
debug!(block_root = ?block_root_to_search, "Parent lookup chain too long");
|
||||
|
||||
// Searching for this parent would extend a parent chain over the max
|
||||
// Insert the tip only to failed chains
|
||||
@@ -283,9 +322,10 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
});
|
||||
} else {
|
||||
// Should never happen, log error and continue the lookup drop
|
||||
error!(self.log, "Unable to transition lookup to range sync";
|
||||
"error" => "Parent chain tip lookup not found",
|
||||
"block_root" => ?parent_chain_tip
|
||||
error!(
|
||||
error = "Parent chain tip lookup not found",
|
||||
block_root = ?parent_chain_tip,
|
||||
"Unable to transition lookup to range sync"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -299,9 +339,10 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
self.drop_lookup_and_children(*lookup_id);
|
||||
} else {
|
||||
// Should never happen
|
||||
error!(self.log, "Unable to transition lookup to range sync";
|
||||
"error" => "Block to drop lookup not found",
|
||||
"block_root" => ?block_to_drop
|
||||
error!(
|
||||
error = "Block to drop lookup not found",
|
||||
block_root = ?block_to_drop,
|
||||
"Unable to transition lookup to range sync"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -316,6 +357,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
/// Searches for a single block hash. If the block's parent is unknown, a chain of blocks is
|
||||
/// constructed.
|
||||
/// Returns true if the lookup is created or already exists
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn new_current_lookup(
|
||||
&mut self,
|
||||
block_root: Hash256,
|
||||
@@ -326,7 +373,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
) -> bool {
|
||||
// If this block or its parent is part of a known failed chain, ignore it.
|
||||
if self.failed_chains.contains(&block_root) {
|
||||
debug!(self.log, "Block is from a past failed chain. Dropping"; "block_root" => ?block_root);
|
||||
debug!(?block_root, "Block is from a past failed chain. Dropping");
|
||||
for peer_id in peers {
|
||||
cx.report_peer(*peer_id, PeerAction::MidToleranceError, "failed_chain");
|
||||
}
|
||||
@@ -343,12 +390,15 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
let component_type = block_component.get_type();
|
||||
let imported = lookup.add_child_components(block_component);
|
||||
if !imported {
|
||||
debug!(self.log, "Lookup child component ignored"; "block_root" => ?block_root, "type" => component_type);
|
||||
debug!(
|
||||
?block_root,
|
||||
component_type, "Lookup child component ignored"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(e) = self.add_peers_to_lookup_and_ancestors(lookup_id, peers, cx) {
|
||||
warn!(self.log, "Error adding peers to ancestor lookup"; "error" => ?e);
|
||||
warn!(error = ?e, "Error adding peers to ancestor lookup");
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -361,7 +411,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
.iter()
|
||||
.any(|(_, lookup)| lookup.is_for_block(awaiting_parent))
|
||||
{
|
||||
warn!(self.log, "Ignoring child lookup parent lookup not found"; "block_root" => ?awaiting_parent);
|
||||
warn!(block_root = ?awaiting_parent, "Ignoring child lookup parent lookup not found");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -369,7 +419,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
// Lookups contain untrusted data, bound the total count of lookups held in memory to reduce
|
||||
// the risk of OOM in case of bugs or malicious activity.
|
||||
if self.single_block_lookups.len() > MAX_LOOKUPS {
|
||||
warn!(self.log, "Dropping lookup reached max"; "block_root" => ?block_root);
|
||||
warn!(?block_root, "Dropping lookup reached max");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -387,18 +437,19 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
Entry::Vacant(entry) => entry.insert(lookup),
|
||||
Entry::Occupied(_) => {
|
||||
// Should never happen
|
||||
warn!(self.log, "Lookup exists with same id"; "id" => id);
|
||||
warn!(id, "Lookup exists with same id");
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
self.log,
|
||||
"Created block lookup";
|
||||
"peer_ids" => ?peers,
|
||||
"block_root" => ?block_root,
|
||||
"awaiting_parent" => awaiting_parent.map(|root| root.to_string()).unwrap_or("none".to_owned()),
|
||||
"id" => lookup.id,
|
||||
?peers,
|
||||
?block_root,
|
||||
awaiting_parent = awaiting_parent
|
||||
.map(|root| root.to_string())
|
||||
.unwrap_or("none".to_owned()),
|
||||
id = lookup.id,
|
||||
"Created block lookup"
|
||||
);
|
||||
metrics::inc_counter(&metrics::SYNC_LOOKUP_CREATED);
|
||||
|
||||
@@ -414,6 +465,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
/* Lookup responses */
|
||||
|
||||
/// Process a block or blob response received from a single lookup request.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn on_download_response<R: RequestState<T>>(
|
||||
&mut self,
|
||||
id: SingleLookupReqId,
|
||||
@@ -437,7 +494,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else {
|
||||
// We don't have the ability to cancel in-flight RPC requests. So this can happen
|
||||
// if we started this RPC request, and later saw the block/blobs via gossip.
|
||||
debug!(self.log, "Block returned for single block lookup not present"; "id" => ?id);
|
||||
debug!(?id, "Block returned for single block lookup not present");
|
||||
return Err(LookupRequestError::UnknownLookup);
|
||||
};
|
||||
|
||||
@@ -448,12 +505,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
|
||||
match response {
|
||||
Ok((response, peer_group, seen_timestamp)) => {
|
||||
debug!(self.log,
|
||||
"Received lookup download success";
|
||||
"block_root" => ?block_root,
|
||||
"id" => ?id,
|
||||
"peer_group" => ?peer_group,
|
||||
"response_type" => ?response_type,
|
||||
debug!(
|
||||
?block_root,
|
||||
?id,
|
||||
?peer_group,
|
||||
?response_type,
|
||||
"Received lookup download success"
|
||||
);
|
||||
|
||||
// Here we could check if response extends a parent chain beyond its max length.
|
||||
@@ -481,12 +538,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
Err(e) => {
|
||||
// No need to log peer source here. When sending a DataColumnsByRoot request we log
|
||||
// the peer and the request ID which is linked to this `id` value here.
|
||||
debug!(self.log,
|
||||
"Received lookup download failure";
|
||||
"block_root" => ?block_root,
|
||||
"id" => ?id,
|
||||
"response_type" => ?response_type,
|
||||
"error" => ?e,
|
||||
debug!(
|
||||
?block_root,
|
||||
?id,
|
||||
?response_type,
|
||||
error = ?e,
|
||||
"Received lookup download failure"
|
||||
);
|
||||
|
||||
request_state.on_download_failure(id.req_id)?;
|
||||
@@ -499,6 +556,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
|
||||
/* Error responses */
|
||||
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn peer_disconnected(&mut self, peer_id: &PeerId) {
|
||||
for (_, lookup) in self.single_block_lookups.iter_mut() {
|
||||
lookup.remove_peer(peer_id);
|
||||
@@ -507,6 +570,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
|
||||
/* Processing responses */
|
||||
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn on_processing_result(
|
||||
&mut self,
|
||||
process_type: BlockProcessType,
|
||||
@@ -527,6 +596,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
self.on_lookup_result(process_type.id(), lookup_result, "processing_result", cx);
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn on_processing_result_inner<R: RequestState<T>>(
|
||||
&mut self,
|
||||
lookup_id: SingleLookupId,
|
||||
@@ -534,7 +609,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
cx: &mut SyncNetworkContext<T>,
|
||||
) -> Result<LookupResult, LookupRequestError> {
|
||||
let Some(lookup) = self.single_block_lookups.get_mut(&lookup_id) else {
|
||||
debug!(self.log, "Unknown single block lookup"; "id" => lookup_id);
|
||||
debug!(id = lookup_id, "Unknown single block lookup");
|
||||
return Err(LookupRequestError::UnknownLookup);
|
||||
};
|
||||
|
||||
@@ -544,12 +619,11 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
.get_state_mut();
|
||||
|
||||
debug!(
|
||||
self.log,
|
||||
"Received lookup processing result";
|
||||
"component" => ?R::response_type(),
|
||||
"block_root" => ?block_root,
|
||||
"id" => lookup_id,
|
||||
"result" => ?result,
|
||||
component = ?R::response_type(),
|
||||
?block_root,
|
||||
id = lookup_id,
|
||||
?result,
|
||||
"Received lookup processing result"
|
||||
);
|
||||
|
||||
let action = match result {
|
||||
@@ -581,20 +655,15 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
BlockProcessingResult::Err(BlockError::DuplicateImportStatusUnknown(..)) => {
|
||||
// This is unreachable because RPC blocks do not undergo gossip verification, and
|
||||
// this error can *only* come from gossip verification.
|
||||
error!(
|
||||
self.log,
|
||||
"Single block lookup hit unreachable condition";
|
||||
"block_root" => ?block_root
|
||||
);
|
||||
error!(?block_root, "Single block lookup hit unreachable condition");
|
||||
Action::Drop
|
||||
}
|
||||
BlockProcessingResult::Ignored => {
|
||||
// Beacon processor signalled to ignore the block processing result.
|
||||
// This implies that the cpu is overloaded. Drop the request.
|
||||
warn!(
|
||||
self.log,
|
||||
"Lookup component processing ignored, cpu might be overloaded";
|
||||
"component" => ?R::response_type(),
|
||||
component = ?R::response_type(),
|
||||
"Lookup component processing ignored, cpu might be overloaded"
|
||||
);
|
||||
Action::Drop
|
||||
}
|
||||
@@ -602,7 +671,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
match e {
|
||||
BlockError::BeaconChainError(e) => {
|
||||
// Internal error
|
||||
error!(self.log, "Beacon chain error processing lookup component"; "block_root" => %block_root, "error" => ?e);
|
||||
error!(%block_root, error = ?e, "Beacon chain error processing lookup component");
|
||||
Action::Drop
|
||||
}
|
||||
BlockError::ParentUnknown { parent_root, .. } => {
|
||||
@@ -618,10 +687,9 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
// These errors indicate that the execution layer is offline
|
||||
// and failed to validate the execution payload. Do not downscore peer.
|
||||
debug!(
|
||||
self.log,
|
||||
"Single block lookup failed. Execution layer is offline / unsynced / misconfigured";
|
||||
"block_root" => ?block_root,
|
||||
"error" => ?e
|
||||
?block_root,
|
||||
error = ?e,
|
||||
"Single block lookup failed. Execution layer is offline / unsynced / misconfigured"
|
||||
);
|
||||
Action::Drop
|
||||
}
|
||||
@@ -629,7 +697,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
if e.category() == AvailabilityCheckErrorCategory::Internal =>
|
||||
{
|
||||
// These errors indicate internal problems and should not downscore the peer
|
||||
warn!(self.log, "Internal availability check failure"; "block_root" => ?block_root, "error" => ?e);
|
||||
warn!(?block_root, error = ?e, "Internal availability check failure");
|
||||
|
||||
// Here we choose *not* to call `on_processing_failure` because this could result in a bad
|
||||
// lookup state transition. This error invalidates both blob and block requests, and we don't know the
|
||||
@@ -638,7 +706,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
Action::Drop
|
||||
}
|
||||
other => {
|
||||
debug!(self.log, "Invalid lookup component"; "block_root" => ?block_root, "component" => ?R::response_type(), "error" => ?other);
|
||||
debug!(
|
||||
?block_root,
|
||||
component = ?R::response_type(),
|
||||
error = ?other,
|
||||
"Invalid lookup component"
|
||||
);
|
||||
let peer_group = request_state.on_processing_failure()?;
|
||||
let peers_to_penalize: Vec<_> = match other {
|
||||
// Note: currently only InvalidColumn errors have index granularity,
|
||||
@@ -685,7 +758,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
Action::ParentUnknown { parent_root } => {
|
||||
let peers = lookup.all_peers();
|
||||
lookup.set_awaiting_parent(parent_root);
|
||||
debug!(self.log, "Marking lookup as awaiting parent"; "id" => lookup.id, "block_root" => ?block_root, "parent_root" => ?parent_root);
|
||||
debug!(
|
||||
id = lookup.id,
|
||||
?block_root,
|
||||
?parent_root,
|
||||
"Marking lookup as awaiting parent"
|
||||
);
|
||||
self.search_parent_of_child(parent_root, block_root, &peers, cx);
|
||||
Ok(LookupResult::Pending)
|
||||
}
|
||||
@@ -700,6 +778,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn on_external_processing_result(
|
||||
&mut self,
|
||||
block_root: Hash256,
|
||||
@@ -725,13 +809,24 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
}
|
||||
|
||||
/// Makes progress on the immediate children of `block_root`
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn continue_child_lookups(&mut self, block_root: Hash256, cx: &mut SyncNetworkContext<T>) {
|
||||
let mut lookup_results = vec![]; // < need to buffer lookup results to not re-borrow &mut self
|
||||
|
||||
for (id, lookup) in self.single_block_lookups.iter_mut() {
|
||||
if lookup.awaiting_parent() == Some(block_root) {
|
||||
lookup.resolve_awaiting_parent();
|
||||
debug!(self.log, "Continuing child lookup"; "parent_root" => ?block_root, "id" => id, "block_root" => ?lookup.block_root());
|
||||
debug!(
|
||||
parent_root = ?block_root,
|
||||
id,
|
||||
block_root = ?lookup.block_root(),
|
||||
"Continuing child lookup"
|
||||
);
|
||||
let result = lookup.continue_requests(cx);
|
||||
lookup_results.push((*id, result));
|
||||
}
|
||||
@@ -745,12 +840,19 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
/// Drops `dropped_id` lookup and all its children recursively. Lookups awaiting a parent need
|
||||
/// the parent to make progress to resolve, therefore we must drop them if the parent is
|
||||
/// dropped.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn drop_lookup_and_children(&mut self, dropped_id: SingleLookupId) {
|
||||
if let Some(dropped_lookup) = self.single_block_lookups.remove(&dropped_id) {
|
||||
debug!(self.log, "Dropping lookup";
|
||||
"id" => ?dropped_id,
|
||||
"block_root" => ?dropped_lookup.block_root(),
|
||||
"awaiting_parent" => ?dropped_lookup.awaiting_parent(),
|
||||
debug!(
|
||||
id = ?dropped_id,
|
||||
block_root = ?dropped_lookup.block_root(),
|
||||
awaiting_parent = ?dropped_lookup.awaiting_parent(),
|
||||
"Dropping lookup"
|
||||
);
|
||||
|
||||
let child_lookups = self
|
||||
@@ -768,6 +870,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
|
||||
/// Common handler for a lookup request error, drop it and update metrics
|
||||
/// Returns true if the lookup is created or already exists
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn on_lookup_result(
|
||||
&mut self,
|
||||
id: SingleLookupId,
|
||||
@@ -779,13 +887,13 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
Ok(LookupResult::Pending) => true, // no action
|
||||
Ok(LookupResult::Completed) => {
|
||||
if let Some(lookup) = self.single_block_lookups.remove(&id) {
|
||||
debug!(self.log, "Dropping completed lookup"; "block" => ?lookup.block_root(), "id" => id);
|
||||
debug!(block = ?lookup.block_root(), id, "Dropping completed lookup");
|
||||
metrics::inc_counter(&metrics::SYNC_LOOKUP_COMPLETED);
|
||||
// Block imported, continue the requests of pending child blocks
|
||||
self.continue_child_lookups(lookup.block_root(), cx);
|
||||
self.update_metrics();
|
||||
} else {
|
||||
debug!(self.log, "Attempting to drop non-existent lookup"; "id" => id);
|
||||
debug!(id, "Attempting to drop non-existent lookup");
|
||||
}
|
||||
false
|
||||
}
|
||||
@@ -793,7 +901,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
// update metrics because the lookup does not exist.
|
||||
Err(LookupRequestError::UnknownLookup) => false,
|
||||
Err(error) => {
|
||||
debug!(self.log, "Dropping lookup on request error"; "id" => id, "source" => source, "error" => ?error);
|
||||
debug!(id, source, ?error, "Dropping lookup on request error");
|
||||
metrics::inc_counter_vec(&metrics::SYNC_LOOKUP_DROPPED, &[error.into()]);
|
||||
self.drop_lookup_and_children(id);
|
||||
self.update_metrics();
|
||||
@@ -805,12 +913,24 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
/* Helper functions */
|
||||
|
||||
/// Drops all the single block requests and returns how many requests were dropped.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn drop_single_block_requests(&mut self) -> usize {
|
||||
let requests_to_drop = self.single_block_lookups.len();
|
||||
self.single_block_lookups.clear();
|
||||
requests_to_drop
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn update_metrics(&self) {
|
||||
metrics::set_gauge(
|
||||
&metrics::SYNC_SINGLE_BLOCK_LOOKUPS,
|
||||
@@ -819,6 +939,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
}
|
||||
|
||||
/// Perform some prune operations on lookups on some interval
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn prune_lookups(&mut self) {
|
||||
self.drop_lookups_without_peers();
|
||||
self.drop_stuck_lookups();
|
||||
@@ -842,6 +968,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
///
|
||||
/// Instead there's no negative for keeping lookups with no peers around for some time. If we
|
||||
/// regularly prune them, it should not be a memory concern (TODO: maybe yes!).
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn drop_lookups_without_peers(&mut self) {
|
||||
for (lookup_id, block_root) in self
|
||||
.single_block_lookups
|
||||
@@ -857,9 +989,10 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
.map(|lookup| (lookup.id, lookup.block_root()))
|
||||
.collect::<Vec<_>>()
|
||||
{
|
||||
debug!(self.log, "Dropping lookup with no peers";
|
||||
"id" => lookup_id,
|
||||
"block_root" => ?block_root
|
||||
debug!(
|
||||
id = lookup_id,
|
||||
%block_root,
|
||||
"Dropping lookup with no peers"
|
||||
);
|
||||
self.drop_lookup_and_children(lookup_id);
|
||||
}
|
||||
@@ -878,6 +1011,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
///
|
||||
/// - One single clear warn level log per stuck incident
|
||||
/// - If the original bug is sporadic, it reduces the time a node is stuck from forever to 15 min
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn drop_stuck_lookups(&mut self) {
|
||||
// While loop to find and drop all disjoint trees of potentially stuck lookups.
|
||||
while let Some(stuck_lookup) = self.single_block_lookups.values().find(|lookup| {
|
||||
@@ -886,7 +1025,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
let ancestor_stuck_lookup = match self.find_oldest_ancestor_lookup(stuck_lookup) {
|
||||
Ok(lookup) => lookup,
|
||||
Err(e) => {
|
||||
warn!(self.log, "Error finding oldest ancestor lookup"; "error" => ?e);
|
||||
warn!(error = ?e,"Error finding oldest ancestor lookup");
|
||||
// Default to dropping the lookup that exceeds the max duration so at least
|
||||
// eventually sync should be unstuck
|
||||
stuck_lookup
|
||||
@@ -894,16 +1033,18 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
};
|
||||
|
||||
if stuck_lookup.id == ancestor_stuck_lookup.id {
|
||||
warn!(self.log, "Notify the devs a sync lookup is stuck";
|
||||
"block_root" => ?stuck_lookup.block_root(),
|
||||
"lookup" => ?stuck_lookup,
|
||||
warn!(
|
||||
block_root = ?stuck_lookup.block_root(),
|
||||
lookup = ?stuck_lookup,
|
||||
"Notify the devs a sync lookup is stuck"
|
||||
);
|
||||
} else {
|
||||
warn!(self.log, "Notify the devs a sync lookup is stuck";
|
||||
"block_root" => ?stuck_lookup.block_root(),
|
||||
"lookup" => ?stuck_lookup,
|
||||
"ancestor_block_root" => ?ancestor_stuck_lookup.block_root(),
|
||||
"ancestor_lookup" => ?ancestor_stuck_lookup,
|
||||
warn!(
|
||||
block_root = ?stuck_lookup.block_root(),
|
||||
lookup = ?stuck_lookup,
|
||||
ancestor_block_root = ?ancestor_stuck_lookup.block_root(),
|
||||
ancestor_lookup = ?ancestor_stuck_lookup,
|
||||
"Notify the devs a sync lookup is stuck"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -913,6 +1054,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
}
|
||||
|
||||
/// Recursively find the oldest ancestor lookup of another lookup
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn find_oldest_ancestor_lookup<'a>(
|
||||
&'a self,
|
||||
lookup: &'a SingleBlockLookup<T>,
|
||||
@@ -937,6 +1084,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
/// Adds peers to a lookup and its ancestors recursively.
|
||||
/// Note: Takes a `lookup_id` as argument to allow recursion on mutable lookups, without having
|
||||
/// to duplicate the code to add peers to a lookup
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "lookup_sync"),
|
||||
name = "lookup_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn add_peers_to_lookup_and_ancestors(
|
||||
&mut self,
|
||||
lookup_id: SingleLookupId,
|
||||
@@ -952,9 +1105,10 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
|
||||
for peer in peers {
|
||||
if lookup.add_peer(*peer) {
|
||||
added_some_peer = true;
|
||||
debug!(self.log, "Adding peer to existing single block lookup";
|
||||
"block_root" => ?lookup.block_root(),
|
||||
"peer" => ?peer
|
||||
debug!(
|
||||
block_root = ?lookup.block_root(),
|
||||
?peer,
|
||||
"Adding peer to existing single block lookup"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,12 +63,13 @@ use lighthouse_network::service::api_types::{
|
||||
use lighthouse_network::types::{NetworkGlobals, SyncState};
|
||||
use lighthouse_network::SyncInfo;
|
||||
use lighthouse_network::{PeerAction, PeerId};
|
||||
use logging::crit;
|
||||
use lru_cache::LRUTimeCache;
|
||||
use slog::{crit, debug, error, info, o, trace, warn, Logger};
|
||||
use std::ops::Sub;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
|
||||
use types::{
|
||||
BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, Hash256, SignedBeaconBlock, Slot,
|
||||
};
|
||||
@@ -246,9 +247,6 @@ pub struct SyncManager<T: BeaconChainTypes> {
|
||||
notified_unknown_roots: LRUTimeCache<(PeerId, Hash256)>,
|
||||
|
||||
sampling: Sampling<T>,
|
||||
|
||||
/// The logger for the import manager.
|
||||
log: Logger,
|
||||
}
|
||||
|
||||
/// Spawns a new `SyncManager` thread which has a weak reference to underlying beacon
|
||||
@@ -261,7 +259,6 @@ pub fn spawn<T: BeaconChainTypes>(
|
||||
beacon_processor: Arc<NetworkBeaconProcessor<T>>,
|
||||
sync_recv: mpsc::UnboundedReceiver<SyncMessage<T::EthSpec>>,
|
||||
fork_context: Arc<ForkContext>,
|
||||
log: slog::Logger,
|
||||
) {
|
||||
assert!(
|
||||
beacon_chain.spec.max_request_blocks(fork_context.current_fork()) as u64 >= T::EthSpec::slots_per_epoch() * EPOCHS_PER_BATCH,
|
||||
@@ -276,12 +273,18 @@ pub fn spawn<T: BeaconChainTypes>(
|
||||
sync_recv,
|
||||
SamplingConfig::Default,
|
||||
fork_context,
|
||||
log.clone(),
|
||||
);
|
||||
|
||||
// spawn the sync manager thread
|
||||
debug!(log, "Sync Manager started");
|
||||
executor.spawn(async move { Box::pin(sync_manager.main()).await }, "sync");
|
||||
debug!("Sync Manager started");
|
||||
executor.spawn(
|
||||
async move {
|
||||
Box::pin(sync_manager.main())
|
||||
.instrument(info_span!("", service = "sync"))
|
||||
.await
|
||||
},
|
||||
"sync",
|
||||
);
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
@@ -292,7 +295,6 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
sync_recv: mpsc::UnboundedReceiver<SyncMessage<T::EthSpec>>,
|
||||
sampling_config: SamplingConfig,
|
||||
fork_context: Arc<ForkContext>,
|
||||
log: slog::Logger,
|
||||
) -> Self {
|
||||
let network_globals = beacon_processor.network_globals.clone();
|
||||
Self {
|
||||
@@ -303,23 +305,14 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
beacon_processor.clone(),
|
||||
beacon_chain.clone(),
|
||||
fork_context.clone(),
|
||||
log.clone(),
|
||||
),
|
||||
range_sync: RangeSync::new(
|
||||
beacon_chain.clone(),
|
||||
log.new(o!("service" => "range_sync")),
|
||||
),
|
||||
backfill_sync: BackFillSync::new(
|
||||
beacon_chain.clone(),
|
||||
network_globals,
|
||||
log.new(o!("service" => "backfill_sync")),
|
||||
),
|
||||
block_lookups: BlockLookups::new(log.new(o!("service"=> "lookup_sync"))),
|
||||
range_sync: RangeSync::new(beacon_chain.clone()),
|
||||
backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals),
|
||||
block_lookups: BlockLookups::new(),
|
||||
notified_unknown_roots: LRUTimeCache::new(Duration::from_secs(
|
||||
NOTIFIED_UNKNOWN_ROOT_EXPIRY_SECONDS,
|
||||
)),
|
||||
sampling: Sampling::new(sampling_config, log.new(o!("service" => "sampling"))),
|
||||
log: log.clone(),
|
||||
sampling: Sampling::new(sampling_config),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -461,10 +454,10 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
};
|
||||
|
||||
let head_slot = head_slot.unwrap_or_else(|| {
|
||||
debug!(self.log,
|
||||
"On add peers force range sync assuming local head_slot";
|
||||
"local_head_slot" => local.head_slot,
|
||||
"head_root" => ?head_root
|
||||
debug!(
|
||||
local_head_slot = %local.head_slot,
|
||||
?head_root,
|
||||
"On add peers force range sync assuming local head_slot"
|
||||
);
|
||||
local.head_slot
|
||||
});
|
||||
@@ -485,7 +478,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
|
||||
/// Handles RPC errors related to requests that were emitted from the sync manager.
|
||||
fn inject_error(&mut self, peer_id: PeerId, request_id: SyncRequestId, error: RPCError) {
|
||||
trace!(self.log, "Sync manager received a failed RPC");
|
||||
trace!("Sync manager received a failed RPC");
|
||||
match request_id {
|
||||
SyncRequestId::SingleBlock { id } => {
|
||||
self.on_single_block_response(id, peer_id, RpcEvent::RPCError(error))
|
||||
@@ -565,15 +558,14 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
let is_connected = self.network_globals().peers.read().is_connected(peer_id);
|
||||
if was_updated {
|
||||
debug!(
|
||||
self.log,
|
||||
"Peer transitioned sync state";
|
||||
"peer_id" => %peer_id,
|
||||
"new_state" => rpr,
|
||||
"our_head_slot" => local_sync_info.head_slot,
|
||||
"our_finalized_epoch" => local_sync_info.finalized_epoch,
|
||||
"their_head_slot" => remote_sync_info.head_slot,
|
||||
"their_finalized_epoch" => remote_sync_info.finalized_epoch,
|
||||
"is_connected" => is_connected
|
||||
%peer_id,
|
||||
new_state = rpr,
|
||||
our_head_slot = %local_sync_info.head_slot,
|
||||
our_finalized_epoch = %local_sync_info.finalized_epoch,
|
||||
their_head_slot = %remote_sync_info.head_slot,
|
||||
their_finalized_epoch = %remote_sync_info.finalized_epoch,
|
||||
is_connected,
|
||||
"Peer transitioned sync state"
|
||||
);
|
||||
|
||||
// A peer has transitioned its sync state. If the new state is "synced" we
|
||||
@@ -584,7 +576,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
}
|
||||
is_connected
|
||||
} else {
|
||||
error!(self.log, "Status'd peer is unknown"; "peer_id" => %peer_id);
|
||||
error!(%peer_id, "Status'd peer is unknown");
|
||||
false
|
||||
}
|
||||
}
|
||||
@@ -603,7 +595,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
fn update_sync_state(&mut self) {
|
||||
let new_state: SyncState = match self.range_sync.state() {
|
||||
Err(e) => {
|
||||
crit!(self.log, "Error getting range sync state"; "error" => %e);
|
||||
crit!(error = %e, "Error getting range sync state");
|
||||
return;
|
||||
}
|
||||
Ok(state) => match state {
|
||||
@@ -652,7 +644,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
}
|
||||
Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if the backfill sync state didn't start.
|
||||
Err(e) => {
|
||||
error!(self.log, "Backfill sync failed to start"; "error" => ?e);
|
||||
error!(error = ?e, "Backfill sync failed to start");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -686,7 +678,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
let old_state = self.network_globals().set_sync_state(new_state);
|
||||
let new_state = self.network_globals().sync_state.read().clone();
|
||||
if !new_state.eq(&old_state) {
|
||||
info!(self.log, "Sync state updated"; "old_state" => %old_state, "new_state" => %new_state);
|
||||
info!(%old_state, %new_state, "Sync state updated");
|
||||
// If we have become synced - Subscribe to all the core subnet topics
|
||||
// We don't need to subscribe if the old state is a state that would have already
|
||||
// invoked this call.
|
||||
@@ -781,7 +773,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
SyncMessage::UnknownParentBlock(peer_id, block, block_root) => {
|
||||
let block_slot = block.slot();
|
||||
let parent_root = block.parent_root();
|
||||
debug!(self.log, "Received unknown parent block message"; "block_root" => %block_root, "parent_root" => %parent_root);
|
||||
debug!(%block_root, %parent_root, "Received unknown parent block message");
|
||||
self.handle_unknown_parent(
|
||||
peer_id,
|
||||
block_root,
|
||||
@@ -799,7 +791,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
let blob_slot = blob.slot();
|
||||
let block_root = blob.block_root();
|
||||
let parent_root = blob.block_parent_root();
|
||||
debug!(self.log, "Received unknown parent blob message"; "block_root" => %block_root, "parent_root" => %parent_root);
|
||||
debug!(%block_root, %parent_root, "Received unknown parent blob message");
|
||||
self.handle_unknown_parent(
|
||||
peer_id,
|
||||
block_root,
|
||||
@@ -817,7 +809,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
let data_column_slot = data_column.slot();
|
||||
let block_root = data_column.block_root();
|
||||
let parent_root = data_column.block_parent_root();
|
||||
debug!(self.log, "Received unknown parent data column message"; "block_root" => %block_root, "parent_root" => %parent_root);
|
||||
debug!(%block_root, %parent_root, "Received unknown parent data column message");
|
||||
self.handle_unknown_parent(
|
||||
peer_id,
|
||||
block_root,
|
||||
@@ -834,12 +826,12 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
SyncMessage::UnknownBlockHashFromAttestation(peer_id, block_root) => {
|
||||
if !self.notified_unknown_roots.contains(&(peer_id, block_root)) {
|
||||
self.notified_unknown_roots.insert((peer_id, block_root));
|
||||
debug!(self.log, "Received unknown block hash message"; "block_root" => ?block_root, "peer" => ?peer_id);
|
||||
debug!(?block_root, ?peer_id, "Received unknown block hash message");
|
||||
self.handle_unknown_block_root(peer_id, block_root);
|
||||
}
|
||||
}
|
||||
SyncMessage::SampleBlock(block_root, block_slot) => {
|
||||
debug!(self.log, "Received SampleBlock message"; "block_root" => %block_root, "slot" => block_slot);
|
||||
debug!(%block_root, slot = %block_slot, "Received SampleBlock message");
|
||||
if let Some((requester, result)) = self
|
||||
.sampling
|
||||
.on_new_sample_request(block_root, &mut self.network)
|
||||
@@ -848,7 +840,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
}
|
||||
}
|
||||
SyncMessage::Disconnect(peer_id) => {
|
||||
debug!(self.log, "Received disconnected message"; "peer_id" => %peer_id);
|
||||
debug!(%peer_id, "Received disconnected message");
|
||||
self.peer_disconnect(&peer_id);
|
||||
}
|
||||
SyncMessage::RpcError {
|
||||
@@ -889,7 +881,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
Ok(ProcessResult::Successful) => {}
|
||||
Ok(ProcessResult::SyncCompleted) => self.update_sync_state(),
|
||||
Err(error) => {
|
||||
error!(self.log, "Backfill sync failed"; "error" => ?error);
|
||||
error!(error = ?error, "Backfill sync failed");
|
||||
// Update the global status
|
||||
self.update_sync_state();
|
||||
}
|
||||
@@ -925,7 +917,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
);
|
||||
}
|
||||
Err(reason) => {
|
||||
debug!(self.log, "Ignoring unknown parent request"; "block_root" => %block_root, "parent_root" => %parent_root, "reason" => reason);
|
||||
debug!(%block_root, %parent_root, reason, "Ignoring unknown parent request");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -937,7 +929,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
.search_unknown_block(block_root, &[peer_id], &mut self.network);
|
||||
}
|
||||
Err(reason) => {
|
||||
debug!(self.log, "Ignoring unknown block request"; "block_root" => %block_root, "reason" => reason);
|
||||
debug!(%block_root, reason, "Ignoring unknown block request");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1015,8 +1007,9 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
|
||||
// Some logs.
|
||||
if dropped_single_blocks_requests > 0 {
|
||||
debug!(self.log, "Execution engine not online. Dropping active requests.";
|
||||
"dropped_single_blocks_requests" => dropped_single_blocks_requests,
|
||||
debug!(
|
||||
dropped_single_blocks_requests,
|
||||
"Execution engine not online. Dropping active requests."
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1042,7 +1035,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
RpcEvent::from_chunk(block, seen_timestamp),
|
||||
),
|
||||
_ => {
|
||||
crit!(self.log, "bad request id for block"; "peer_id" => %peer_id );
|
||||
crit!(%peer_id, "bad request id for block");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1084,7 +1077,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
RpcEvent::from_chunk(blob, seen_timestamp),
|
||||
),
|
||||
_ => {
|
||||
crit!(self.log, "bad request id for blob"; "peer_id" => %peer_id);
|
||||
crit!(%peer_id, "bad request id for blob");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1110,7 +1103,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
RpcEvent::from_chunk(data_column, seen_timestamp),
|
||||
),
|
||||
_ => {
|
||||
crit!(self.log, "bad request id for data_column"; "peer_id" => %peer_id);
|
||||
crit!(%peer_id, "bad request id for data_column");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1228,7 +1221,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
fn on_sampling_result(&mut self, requester: SamplingRequester, result: SamplingResult) {
|
||||
match requester {
|
||||
SamplingRequester::ImportedBlock(block_root) => {
|
||||
debug!(self.log, "Sampling result"; "block_root" => %block_root, "result" => ?result);
|
||||
debug!(%block_root, ?result, "Sampling result");
|
||||
|
||||
match result {
|
||||
Ok(_) => {
|
||||
@@ -1239,11 +1232,11 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
.beacon_processor()
|
||||
.send_sampling_completed(block_root)
|
||||
{
|
||||
warn!(self.log, "Error sending sampling result"; "block_root" => ?block_root, "reason" => ?e);
|
||||
warn!(?block_root, reason = ?e, "Error sending sampling result");
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(self.log, "Sampling failed"; "block_root" => %block_root, "reason" => ?e);
|
||||
warn!(?block_root, reason = ?e, "Sampling failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,13 +34,13 @@ use requests::{
|
||||
ActiveRequests, BlobsByRangeRequestItems, BlobsByRootRequestItems, BlocksByRangeRequestItems,
|
||||
BlocksByRootRequestItems, DataColumnsByRangeRequestItems, DataColumnsByRootRequestItems,
|
||||
};
|
||||
use slog::{debug, error, warn};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt::Debug;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::{debug, error, span, warn, Level};
|
||||
use types::blob_sidecar::FixedBlobSidecarList;
|
||||
use types::{
|
||||
BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext,
|
||||
@@ -74,10 +74,10 @@ pub type CustodyByRootResult<T> =
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum RpcResponseError {
|
||||
RpcError(RPCError),
|
||||
RpcError(#[allow(dead_code)] RPCError),
|
||||
VerifyError(LookupVerifyError),
|
||||
CustodyRequestError(CustodyRequestError),
|
||||
BlockComponentCouplingError(String),
|
||||
CustodyRequestError(#[allow(dead_code)] CustodyRequestError),
|
||||
BlockComponentCouplingError(#[allow(dead_code)] String),
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
@@ -89,6 +89,19 @@ pub enum RpcRequestSendError {
|
||||
SlotClockError,
|
||||
}
|
||||
|
||||
/// Renders each `RpcRequestSendError` variant as a short, fixed, human-readable message.
impl std::fmt::Display for RpcRequestSendError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
RpcRequestSendError::NetworkSendError => write!(f, "Network send error"),
|
||||
RpcRequestSendError::NoCustodyPeers => write!(f, "No custody peers"),
|
||||
// The wrapped custody error is printed with its `Debug` representation (`{:?}`).
RpcRequestSendError::CustodyRequestError(e) => {
|
||||
write!(f, "Custody request error: {:?}", e)
|
||||
}
|
||||
RpcRequestSendError::SlotClockError => write!(f, "Slot clock error"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum SendErrorProcessor {
|
||||
SendError,
|
||||
@@ -201,9 +214,6 @@ pub struct SyncNetworkContext<T: BeaconChainTypes> {
|
||||
pub chain: Arc<BeaconChain<T>>,
|
||||
|
||||
fork_context: Arc<ForkContext>,
|
||||
|
||||
/// Logger for the `SyncNetworkContext`.
|
||||
pub log: slog::Logger,
|
||||
}
|
||||
|
||||
/// Small enumeration to make dealing with block and blob requests easier.
|
||||
@@ -219,8 +229,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
network_beacon_processor: Arc<NetworkBeaconProcessor<T>>,
|
||||
chain: Arc<BeaconChain<T>>,
|
||||
fork_context: Arc<ForkContext>,
|
||||
log: slog::Logger,
|
||||
) -> Self {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
SyncNetworkContext {
|
||||
network_send,
|
||||
execution_engine_state: EngineState::Online, // always assume `Online` at the start
|
||||
@@ -236,7 +251,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
network_beacon_processor,
|
||||
chain,
|
||||
fork_context,
|
||||
log,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -267,7 +281,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
network_beacon_processor: _,
|
||||
chain: _,
|
||||
fork_context: _,
|
||||
log: _,
|
||||
} = self;
|
||||
|
||||
let blocks_by_root_ids = blocks_by_root_requests
|
||||
@@ -330,17 +343,23 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
}
|
||||
|
||||
pub fn status_peers<C: ToStatusMessage>(&self, chain: &C, peers: impl Iterator<Item = PeerId>) {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
let status_message = chain.status_message();
|
||||
for peer_id in peers {
|
||||
debug!(
|
||||
self.log,
|
||||
"Sending Status Request";
|
||||
"peer" => %peer_id,
|
||||
"fork_digest" => ?status_message.fork_digest,
|
||||
"finalized_root" => ?status_message.finalized_root,
|
||||
"finalized_epoch" => ?status_message.finalized_epoch,
|
||||
"head_root" => %status_message.head_root,
|
||||
"head_slot" => %status_message.head_slot,
|
||||
peer = %peer_id,
|
||||
fork_digest = ?status_message.fork_digest,
|
||||
finalized_root = ?status_message.finalized_root,
|
||||
finalized_epoch = ?status_message.finalized_epoch,
|
||||
head_root = %status_message.head_root,
|
||||
head_slot = %status_message.head_slot,
|
||||
"Sending Status Request"
|
||||
);
|
||||
|
||||
let request = RequestType::Status(status_message.clone());
|
||||
@@ -385,7 +404,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
let (expects_columns, data_column_requests) =
|
||||
if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) {
|
||||
let column_indexes = self.network_globals().sampling_columns.clone();
|
||||
|
||||
let data_column_requests = self
|
||||
.make_columns_by_range_requests(request, &column_indexes)?
|
||||
.into_iter()
|
||||
@@ -518,6 +536,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
return Ok(LookupRequestResult::Pending("no peers"));
|
||||
};
|
||||
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
match self.chain.get_block_process_status(&block_root) {
|
||||
// Unknown block, continue request to download
|
||||
BlockProcessStatus::Unknown => {}
|
||||
@@ -560,12 +585,11 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
.map_err(|_| RpcRequestSendError::NetworkSendError)?;
|
||||
|
||||
debug!(
|
||||
self.log,
|
||||
"Sync RPC request sent";
|
||||
"method" => "BlocksByRoot",
|
||||
"block_root" => ?block_root,
|
||||
"peer" => %peer_id,
|
||||
"id" => %id
|
||||
method = "BlocksByRoot",
|
||||
?block_root,
|
||||
peer = %peer_id,
|
||||
%id,
|
||||
"Sync RPC request sent"
|
||||
);
|
||||
|
||||
self.blocks_by_root_requests.insert(
|
||||
@@ -608,6 +632,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
return Ok(LookupRequestResult::Pending("no peers"));
|
||||
};
|
||||
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
let imported_blob_indexes = self
|
||||
.chain
|
||||
.data_availability_checker
|
||||
@@ -643,13 +674,12 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
.map_err(|_| RpcRequestSendError::NetworkSendError)?;
|
||||
|
||||
debug!(
|
||||
self.log,
|
||||
"Sync RPC request sent";
|
||||
"method" => "BlobsByRoot",
|
||||
"block_root" => ?block_root,
|
||||
"blob_indices" => ?indices,
|
||||
"peer" => %peer_id,
|
||||
"id" => %id
|
||||
method = "BlobsByRoot",
|
||||
?block_root,
|
||||
blob_indices = ?indices,
|
||||
peer = %peer_id,
|
||||
%id,
|
||||
"Sync RPC request sent"
|
||||
);
|
||||
|
||||
self.blobs_by_root_requests.insert(
|
||||
@@ -673,6 +703,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
request: DataColumnsByRootSingleBlockRequest,
|
||||
expect_max_responses: bool,
|
||||
) -> Result<LookupRequestResult<DataColumnsByRootRequestId>, &'static str> {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
let id = DataColumnsByRootRequestId {
|
||||
id: self.next_id(),
|
||||
requester,
|
||||
@@ -685,13 +722,12 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
})?;
|
||||
|
||||
debug!(
|
||||
self.log,
|
||||
"Sync RPC request sent";
|
||||
"method" => "DataColumnsByRoot",
|
||||
"block_root" => ?request.block_root,
|
||||
"indices" => ?request.indices,
|
||||
"peer" => %peer_id,
|
||||
"id" => %id,
|
||||
method = "DataColumnsByRoot",
|
||||
block_root = ?request.block_root,
|
||||
indices = ?request.indices,
|
||||
peer = %peer_id,
|
||||
%id,
|
||||
"Sync RPC request sent"
|
||||
);
|
||||
|
||||
self.data_columns_by_root_requests.insert(
|
||||
@@ -714,6 +750,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
block_root: Hash256,
|
||||
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
|
||||
) -> Result<LookupRequestResult, RpcRequestSendError> {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
let custody_indexes_imported = self
|
||||
.chain
|
||||
.data_availability_checker
|
||||
@@ -740,11 +783,10 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
};
|
||||
|
||||
debug!(
|
||||
self.log,
|
||||
"Starting custody columns request";
|
||||
"block_root" => ?block_root,
|
||||
"indices" => ?custody_indexes_to_fetch,
|
||||
"id" => %id
|
||||
?block_root,
|
||||
indices = ?custody_indexes_to_fetch,
|
||||
%id,
|
||||
"Starting custody columns request"
|
||||
);
|
||||
|
||||
let requester = CustodyRequester(id);
|
||||
@@ -753,7 +795,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
CustodyId { requester },
|
||||
&custody_indexes_to_fetch,
|
||||
lookup_peers,
|
||||
self.log.clone(),
|
||||
);
|
||||
|
||||
// Note that you can only send, but not handle a response here
|
||||
@@ -788,13 +829,12 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
.map_err(|_| RpcRequestSendError::NetworkSendError)?;
|
||||
|
||||
debug!(
|
||||
self.log,
|
||||
"Sync RPC request sent";
|
||||
"method" => "BlocksByRange",
|
||||
"slots" => request.count(),
|
||||
"epoch" => Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()),
|
||||
"peer" => %peer_id,
|
||||
"id" => %id,
|
||||
method = "BlocksByRange",
|
||||
slots = request.count(),
|
||||
epoch = %Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()),
|
||||
peer = %peer_id,
|
||||
%id,
|
||||
"Sync RPC request sent"
|
||||
);
|
||||
|
||||
self.blocks_by_range_requests.insert(
|
||||
@@ -830,13 +870,12 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
.map_err(|_| RpcRequestSendError::NetworkSendError)?;
|
||||
|
||||
debug!(
|
||||
self.log,
|
||||
"Sync RPC request sent";
|
||||
"method" => "BlobsByRange",
|
||||
"slots" => request.count,
|
||||
"epoch" => request_epoch,
|
||||
"peer" => %peer_id,
|
||||
"id" => %id,
|
||||
method = "BlobsByRange",
|
||||
slots = request.count,
|
||||
epoch = %request_epoch,
|
||||
peer = %peer_id,
|
||||
%id,
|
||||
"Sync RPC request sent"
|
||||
);
|
||||
|
||||
let max_blobs_per_block = self.chain.spec.max_blobs_per_block(request_epoch);
|
||||
@@ -870,14 +909,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
.map_err(|_| RpcRequestSendError::NetworkSendError)?;
|
||||
|
||||
debug!(
|
||||
self.log,
|
||||
"Sync RPC request sent";
|
||||
"method" => "DataColumnsByRange",
|
||||
"slots" => request.count,
|
||||
"epoch" => Slot::new(request.start_slot).epoch(T::EthSpec::slots_per_epoch()),
|
||||
"columns" => ?request.columns,
|
||||
"peer" => %peer_id,
|
||||
"id" => %id,
|
||||
method = "DataColumnsByRange",
|
||||
slots = request.count,
|
||||
epoch = %Slot::new(request.start_slot).epoch(T::EthSpec::slots_per_epoch()),
|
||||
columns = ?request.columns,
|
||||
peer = %peer_id,
|
||||
%id,
|
||||
"Sync RPC request sent"
|
||||
);
|
||||
|
||||
self.data_columns_by_range_requests.insert(
|
||||
@@ -896,13 +934,26 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
}
|
||||
|
||||
pub fn update_execution_engine_state(&mut self, engine_state: EngineState) {
|
||||
debug!(self.log, "Sync's view on execution engine state updated";
|
||||
"past_state" => ?self.execution_engine_state, "new_state" => ?engine_state);
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
debug!(past_state = ?self.execution_engine_state, new_state = ?engine_state, "Sync's view on execution engine state updated");
|
||||
self.execution_engine_state = engine_state;
|
||||
}
|
||||
|
||||
/// Terminates the connection with the peer and bans them.
|
||||
pub fn goodbye_peer(&mut self, peer_id: PeerId, reason: GoodbyeReason) {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
self.network_send
|
||||
.send(NetworkMessage::GoodbyePeer {
|
||||
peer_id,
|
||||
@@ -910,13 +961,20 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
source: ReportSource::SyncService,
|
||||
})
|
||||
.unwrap_or_else(|_| {
|
||||
warn!(self.log, "Could not report peer: channel failed");
|
||||
warn!("Could not report peer: channel failed");
|
||||
});
|
||||
}
|
||||
|
||||
/// Reports to the scoring algorithm the behaviour of a peer.
|
||||
pub fn report_peer(&self, peer_id: PeerId, action: PeerAction, msg: &'static str) {
|
||||
debug!(self.log, "Sync reporting peer"; "peer_id" => %peer_id, "action" => %action, "msg" => %msg);
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
debug!(%peer_id, %action, %msg, "Sync reporting peer");
|
||||
self.network_send
|
||||
.send(NetworkMessage::ReportPeer {
|
||||
peer_id,
|
||||
@@ -925,23 +983,37 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
msg,
|
||||
})
|
||||
.unwrap_or_else(|e| {
|
||||
warn!(self.log, "Could not report peer: channel failed"; "error"=> %e);
|
||||
warn!(error = %e, "Could not report peer: channel failed");
|
||||
});
|
||||
}
|
||||
|
||||
/// Subscribes to core topics.
|
||||
pub fn subscribe_core_topics(&self) {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
self.network_send
|
||||
.send(NetworkMessage::SubscribeCoreTopics)
|
||||
.unwrap_or_else(|e| {
|
||||
warn!(self.log, "Could not subscribe to core topics."; "error" => %e);
|
||||
warn!(error = %e, "Could not subscribe to core topics.");
|
||||
});
|
||||
}
|
||||
|
||||
/// Sends an arbitrary network message.
|
||||
fn send_network_msg(&self, msg: NetworkMessage<T::EthSpec>) -> Result<(), &'static str> {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
self.network_send.send(msg).map_err(|_| {
|
||||
debug!(self.log, "Could not send message to the network service");
|
||||
debug!("Could not send message to the network service");
|
||||
"Network channel send Failed"
|
||||
})
|
||||
}
|
||||
@@ -1128,20 +1200,18 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
None => {}
|
||||
Some(Ok((v, _))) => {
|
||||
debug!(
|
||||
self.log,
|
||||
"Sync RPC request completed";
|
||||
"id" => %id,
|
||||
"method" => method,
|
||||
"count" => get_count(v)
|
||||
%id,
|
||||
method,
|
||||
count = get_count(v),
|
||||
"Sync RPC request completed"
|
||||
);
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
debug!(
|
||||
self.log,
|
||||
"Sync RPC request error";
|
||||
"id" => %id,
|
||||
"method" => method,
|
||||
"error" => ?e
|
||||
%id,
|
||||
method,
|
||||
error = ?e,
|
||||
"Sync RPC request error"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1166,11 +1236,18 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
peer_id: PeerId,
|
||||
resp: RpcResponseResult<Vec<Arc<DataColumnSidecar<T::EthSpec>>>>,
|
||||
) -> Option<CustodyByRootResult<T::EthSpec>> {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
// Note: need to remove the request to borrow self again below. Otherwise we can't
|
||||
// do nested requests
|
||||
let Some(mut request) = self.custody_by_root_requests.remove(&id.requester) else {
|
||||
// TODO(das): This log can happen if the request is error'ed early and dropped
|
||||
debug!(self.log, "Custody column downloaded event for unknown request"; "id" => ?id);
|
||||
debug!(?id, "Custody column downloaded event for unknown request");
|
||||
return None;
|
||||
};
|
||||
|
||||
@@ -1185,6 +1262,13 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
request: ActiveCustodyRequest<T>,
|
||||
result: CustodyRequestResult<T::EthSpec>,
|
||||
) -> Option<CustodyByRootResult<T::EthSpec>> {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
let result = result
|
||||
.map_err(RpcResponseError::CustodyRequestError)
|
||||
.transpose();
|
||||
@@ -1193,10 +1277,10 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
// an Option first to use in an `if let Some() { act on result }` block.
|
||||
match result.as_ref() {
|
||||
Some(Ok((columns, peer_group, _))) => {
|
||||
debug!(self.log, "Custody request success, removing"; "id" => ?id, "count" => columns.len(), "peers" => ?peer_group)
|
||||
debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing")
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
debug!(self.log, "Custody request failure, removing"; "id" => ?id, "error" => ?e)
|
||||
debug!(?id, error = ?e, "Custody request failure, removing" )
|
||||
}
|
||||
None => {
|
||||
self.custody_by_root_requests.insert(id, request);
|
||||
@@ -1212,11 +1296,18 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
block: RpcBlock<T::EthSpec>,
|
||||
seen_timestamp: Duration,
|
||||
) -> Result<(), SendErrorProcessor> {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
let beacon_processor = self
|
||||
.beacon_processor_if_enabled()
|
||||
.ok_or(SendErrorProcessor::ProcessorNotAvailable)?;
|
||||
|
||||
debug!(self.log, "Sending block for processing"; "block" => ?block_root, "id" => id);
|
||||
debug!(block = ?block_root, id, "Sending block for processing");
|
||||
// Lookup sync event safety: If `beacon_processor.send_rpc_beacon_block` returns Ok() sync
|
||||
// must receive a single `SyncMessage::BlockComponentProcessed` with this process type
|
||||
beacon_processor
|
||||
@@ -1228,9 +1319,8 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
)
|
||||
.map_err(|e| {
|
||||
error!(
|
||||
self.log,
|
||||
"Failed to send sync block to processor";
|
||||
"error" => ?e
|
||||
error = ?e,
|
||||
"Failed to send sync block to processor"
|
||||
);
|
||||
SendErrorProcessor::SendError
|
||||
})
|
||||
@@ -1243,11 +1333,18 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
blobs: FixedBlobSidecarList<T::EthSpec>,
|
||||
seen_timestamp: Duration,
|
||||
) -> Result<(), SendErrorProcessor> {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
let beacon_processor = self
|
||||
.beacon_processor_if_enabled()
|
||||
.ok_or(SendErrorProcessor::ProcessorNotAvailable)?;
|
||||
|
||||
debug!(self.log, "Sending blobs for processing"; "block" => ?block_root, "id" => id);
|
||||
debug!(?block_root, ?id, "Sending blobs for processing");
|
||||
// Lookup sync event safety: If `beacon_processor.send_rpc_blobs` returns Ok() sync
|
||||
// must receive a single `SyncMessage::BlockComponentProcessed` event with this process type
|
||||
beacon_processor
|
||||
@@ -1259,9 +1356,8 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
)
|
||||
.map_err(|e| {
|
||||
error!(
|
||||
self.log,
|
||||
"Failed to send sync blobs to processor";
|
||||
"error" => ?e
|
||||
error = ?e,
|
||||
"Failed to send sync blobs to processor"
|
||||
);
|
||||
SendErrorProcessor::SendError
|
||||
})
|
||||
@@ -1275,19 +1371,29 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
|
||||
seen_timestamp: Duration,
|
||||
process_type: BlockProcessType,
|
||||
) -> Result<(), SendErrorProcessor> {
|
||||
let span = span!(
|
||||
Level::INFO,
|
||||
"SyncNetworkContext",
|
||||
service = "network_context"
|
||||
);
|
||||
let _enter = span.enter();
|
||||
|
||||
let beacon_processor = self
|
||||
.beacon_processor_if_enabled()
|
||||
.ok_or(SendErrorProcessor::ProcessorNotAvailable)?;
|
||||
|
||||
debug!(self.log, "Sending custody columns for processing"; "block" => ?block_root, "process_type" => ?process_type);
|
||||
debug!(
|
||||
?block_root,
|
||||
?process_type,
|
||||
"Sending custody columns for processing"
|
||||
);
|
||||
|
||||
beacon_processor
|
||||
.send_rpc_custody_columns(block_root, custody_columns, seen_timestamp, process_type)
|
||||
.map_err(|e| {
|
||||
error!(
|
||||
self.log,
|
||||
"Failed to send sync custody columns to processor";
|
||||
"error" => ?e
|
||||
error = ?e,
|
||||
"Failed to send sync custody columns to processor"
|
||||
);
|
||||
SendErrorProcessor::SendError
|
||||
})
|
||||
|
||||
@@ -9,10 +9,10 @@ use lighthouse_network::PeerId;
|
||||
use lru_cache::LRUTimeCache;
|
||||
use parking_lot::RwLock;
|
||||
use rand::Rng;
|
||||
use slog::{debug, warn};
|
||||
use std::collections::HashSet;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{collections::HashMap, marker::PhantomData, sync::Arc};
|
||||
use tracing::{debug, warn};
|
||||
use types::EthSpec;
|
||||
use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256};
|
||||
|
||||
@@ -36,8 +36,7 @@ pub struct ActiveCustodyRequest<T: BeaconChainTypes> {
|
||||
failed_peers: LRUTimeCache<PeerId>,
|
||||
/// Set of peers that claim to have imported this block and their custody columns
|
||||
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
|
||||
/// Logger for the `SyncNetworkContext`.
|
||||
pub log: slog::Logger,
|
||||
|
||||
_phantom: PhantomData<T>,
|
||||
}
|
||||
|
||||
@@ -70,7 +69,6 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
custody_id: CustodyId,
|
||||
column_indices: &[ColumnIndex],
|
||||
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
|
||||
log: slog::Logger,
|
||||
) -> Self {
|
||||
Self {
|
||||
block_root,
|
||||
@@ -83,7 +81,6 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
active_batch_columns_requests: <_>::default(),
|
||||
failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)),
|
||||
lookup_peers,
|
||||
log,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
@@ -104,24 +101,24 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
cx: &mut SyncNetworkContext<T>,
|
||||
) -> CustodyRequestResult<T::EthSpec> {
|
||||
let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else {
|
||||
warn!(self.log,
|
||||
"Received custody column response for unrequested index";
|
||||
"id" => ?self.custody_id,
|
||||
"block_root" => ?self.block_root,
|
||||
"req_id" => %req_id,
|
||||
warn!(
|
||||
id = ?self.custody_id,
|
||||
block_root = ?self.block_root,
|
||||
%req_id,
|
||||
"Received custody column response for unrequested index"
|
||||
);
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
match resp {
|
||||
Ok((data_columns, seen_timestamp)) => {
|
||||
debug!(self.log,
|
||||
"Custody column download success";
|
||||
"id" => ?self.custody_id,
|
||||
"block_root" => ?self.block_root,
|
||||
"req_id" => %req_id,
|
||||
"peer" => %peer_id,
|
||||
"count" => data_columns.len()
|
||||
debug!(
|
||||
id = ?self.custody_id,
|
||||
block_root = ?self.block_root,
|
||||
%req_id,
|
||||
%peer_id,
|
||||
count = data_columns.len(),
|
||||
"Custody column download success"
|
||||
);
|
||||
|
||||
// Map columns by index as an optimization to not loop the returned list on each
|
||||
@@ -163,27 +160,27 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
|
||||
if !missing_column_indexes.is_empty() {
|
||||
// Note: Batch logging that columns are missing to not spam logger
|
||||
debug!(self.log,
|
||||
"Custody column peer claims to not have some data";
|
||||
"id" => ?self.custody_id,
|
||||
"block_root" => ?self.block_root,
|
||||
"req_id" => %req_id,
|
||||
"peer" => %peer_id,
|
||||
debug!(
|
||||
id = ?self.custody_id,
|
||||
block_root = ?self.block_root,
|
||||
%req_id,
|
||||
%peer_id,
|
||||
// TODO(das): this property can become very noisy, being the full range 0..128
|
||||
"missing_column_indexes" => ?missing_column_indexes
|
||||
?missing_column_indexes,
|
||||
"Custody column peer claims to not have some data"
|
||||
);
|
||||
|
||||
self.failed_peers.insert(peer_id);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
debug!(self.log,
|
||||
"Custody column download error";
|
||||
"id" => ?self.custody_id,
|
||||
"block_root" => ?self.block_root,
|
||||
"req_id" => %req_id,
|
||||
"peer" => %peer_id,
|
||||
"error" => ?err
|
||||
debug!(
|
||||
id = ?self.custody_id,
|
||||
block_root = ?self.block_root,
|
||||
%req_id,
|
||||
%peer_id,
|
||||
error = ?err,
|
||||
"Custody column download error"
|
||||
);
|
||||
|
||||
// TODO(das): Should mark peer as failed and try from another peer
|
||||
|
||||
@@ -12,11 +12,11 @@ use lighthouse_network::service::api_types::{
|
||||
};
|
||||
use lighthouse_network::{PeerAction, PeerId};
|
||||
use rand::{seq::SliceRandom, thread_rng};
|
||||
use slog::{debug, error, warn};
|
||||
use std::{
|
||||
collections::hash_map::Entry, collections::HashMap, marker::PhantomData, sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
use tracing::{debug, error, instrument, warn};
|
||||
use types::{data_column_sidecar::ColumnIndex, ChainSpec, DataColumnSidecar, Hash256};
|
||||
|
||||
pub type SamplingResult = Result<(), SamplingError>;
|
||||
@@ -26,24 +26,35 @@ type DataColumnSidecarList<E> = Vec<Arc<DataColumnSidecar<E>>>;
|
||||
pub struct Sampling<T: BeaconChainTypes> {
|
||||
requests: HashMap<SamplingRequester, ActiveSamplingRequest<T>>,
|
||||
sampling_config: SamplingConfig,
|
||||
log: slog::Logger,
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> Sampling<T> {
|
||||
pub fn new(sampling_config: SamplingConfig, log: slog::Logger) -> Self {
|
||||
#[instrument(parent = None,level = "info", fields(service = "sampling"), name = "sampling")]
|
||||
pub fn new(sampling_config: SamplingConfig) -> Self {
|
||||
Self {
|
||||
requests: <_>::default(),
|
||||
sampling_config,
|
||||
log,
|
||||
}
|
||||
}
|
||||
|
||||
/// Test-only helper: returns the `block_root` of every request currently held in
/// `self.requests`.
///
/// The `#[instrument]` attribute wraps the call in an info-level span named `sampling`
/// with a `service = "sampling"` field and no parent span; `skip_all` keeps the
/// function arguments out of the span fields.
#[cfg(test)]
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "sampling"),
|
||||
name = "sampling",
|
||||
skip_all
|
||||
)]
|
||||
pub fn active_sampling_requests(&self) -> Vec<Hash256> {
|
||||
self.requests.values().map(|r| r.block_root).collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "sampling"),
|
||||
name = "sampling",
|
||||
skip_all
|
||||
)]
|
||||
pub fn get_request_status(
|
||||
&self,
|
||||
block_root: Hash256,
|
||||
@@ -61,6 +72,12 @@ impl<T: BeaconChainTypes> Sampling<T> {
|
||||
///
|
||||
/// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
|
||||
/// - `None`: Request still active, requester should do no action
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "sampling"),
|
||||
name = "sampling",
|
||||
skip_all
|
||||
)]
|
||||
pub fn on_new_sample_request(
|
||||
&mut self,
|
||||
block_root: Hash256,
|
||||
@@ -73,7 +90,6 @@ impl<T: BeaconChainTypes> Sampling<T> {
|
||||
block_root,
|
||||
id,
|
||||
&self.sampling_config,
|
||||
self.log.clone(),
|
||||
&cx.chain.spec,
|
||||
)),
|
||||
Entry::Occupied(_) => {
|
||||
@@ -82,15 +98,15 @@ impl<T: BeaconChainTypes> Sampling<T> {
|
||||
// TODO(das): Should track failed sampling request for some time? Otherwise there's
|
||||
// a risk of a loop with multiple triggers creating the request, then failing,
|
||||
// and repeat.
|
||||
debug!(self.log, "Ignoring duplicate sampling request"; "id" => ?id);
|
||||
debug!(?id, "Ignoring duplicate sampling request");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
debug!(self.log,
|
||||
"Created new sample request";
|
||||
"id" => ?id,
|
||||
"column_selection" => ?request.column_selection()
|
||||
debug!(
|
||||
?id,
|
||||
column_selection = ?request.column_selection(),
|
||||
"Created new sample request"
|
||||
);
|
||||
|
||||
// TODO(das): If a node has very few peers, continue_sampling() will attempt to find enough
|
||||
@@ -107,6 +123,12 @@ impl<T: BeaconChainTypes> Sampling<T> {
|
||||
///
|
||||
/// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
|
||||
/// - `None`: Request still active, requester should do no action
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "sampling"),
|
||||
name = "sampling",
|
||||
skip_all
|
||||
)]
|
||||
pub fn on_sample_downloaded(
|
||||
&mut self,
|
||||
id: SamplingId,
|
||||
@@ -116,7 +138,7 @@ impl<T: BeaconChainTypes> Sampling<T> {
|
||||
) -> Option<(SamplingRequester, SamplingResult)> {
|
||||
let Some(request) = self.requests.get_mut(&id.id) else {
|
||||
// TODO(das): This log can happen if the request errored early and was dropped
|
||||
debug!(self.log, "Sample downloaded event for unknown request"; "id" => ?id);
|
||||
debug!(?id, "Sample downloaded event for unknown request");
|
||||
return None;
|
||||
};
|
||||
|
||||
@@ -131,6 +153,12 @@ impl<T: BeaconChainTypes> Sampling<T> {
|
||||
///
|
||||
/// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
|
||||
/// - `None`: Request still active, requester should do no action
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "sampling"),
|
||||
name = "sampling",
|
||||
skip_all
|
||||
)]
|
||||
pub fn on_sample_verified(
|
||||
&mut self,
|
||||
id: SamplingId,
|
||||
@@ -139,7 +167,7 @@ impl<T: BeaconChainTypes> Sampling<T> {
|
||||
) -> Option<(SamplingRequester, SamplingResult)> {
|
||||
let Some(request) = self.requests.get_mut(&id.id) else {
|
||||
// TODO(das): This log can happen if the request errored early and was dropped
|
||||
debug!(self.log, "Sample verified event for unknown request"; "id" => ?id);
|
||||
debug!(?id, "Sample verified event for unknown request");
|
||||
return None;
|
||||
};
|
||||
|
||||
@@ -150,6 +178,12 @@ impl<T: BeaconChainTypes> Sampling<T> {
|
||||
/// Converts a result from the internal format of `ActiveSamplingRequest` (error first to use ?
|
||||
/// conveniently), to an Option first format to use an `if let Some() { act on result }` pattern
|
||||
/// in the sync manager.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(service = "sampling"),
|
||||
name = "sampling",
|
||||
skip_all
|
||||
)]
|
||||
fn handle_sampling_result(
|
||||
&mut self,
|
||||
result: Result<Option<()>, SamplingError>,
|
||||
@@ -157,7 +191,7 @@ impl<T: BeaconChainTypes> Sampling<T> {
|
||||
) -> Option<(SamplingRequester, SamplingResult)> {
|
||||
let result = result.transpose();
|
||||
if let Some(result) = result {
|
||||
debug!(self.log, "Sampling request completed, removing"; "id" => ?id, "result" => ?result);
|
||||
debug!(?id, ?result, "Sampling request completed, removing");
|
||||
metrics::inc_counter_vec(
|
||||
&metrics::SAMPLING_REQUEST_RESULT,
|
||||
&[metrics::from_result(&result)],
|
||||
@@ -180,8 +214,6 @@ pub struct ActiveSamplingRequest<T: BeaconChainTypes> {
|
||||
current_sampling_request_id: SamplingRequestId,
|
||||
column_shuffle: Vec<ColumnIndex>,
|
||||
required_successes: Vec<usize>,
|
||||
/// Logger for the `SyncNetworkContext`.
|
||||
pub log: slog::Logger,
|
||||
_phantom: PhantomData<T>,
|
||||
}
|
||||
|
||||
@@ -212,7 +244,6 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
block_root: Hash256,
|
||||
requester_id: SamplingRequester,
|
||||
sampling_config: &SamplingConfig,
|
||||
log: slog::Logger,
|
||||
spec: &ChainSpec,
|
||||
) -> Self {
|
||||
// Select ahead of time the full list of to-sample columns
|
||||
@@ -232,7 +263,6 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
SamplingConfig::Default => REQUIRED_SUCCESSES.to_vec(),
|
||||
SamplingConfig::Custom { required_successes } => required_successes.clone(),
|
||||
},
|
||||
log,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
@@ -275,9 +305,9 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
.column_indexes_by_sampling_request
|
||||
.get(&sampling_request_id)
|
||||
else {
|
||||
error!(self.log,
|
||||
"Column indexes for the sampling request ID not found";
|
||||
"sampling_request_id" => ?sampling_request_id
|
||||
error!(
|
||||
?sampling_request_id,
|
||||
"Column indexes for the sampling request ID not found"
|
||||
);
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -288,11 +318,11 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
.iter()
|
||||
.map(|r| r.index)
|
||||
.collect::<Vec<_>>();
|
||||
debug!(self.log,
|
||||
"Sample download success";
|
||||
"block_root" => %self.block_root,
|
||||
"column_indexes" => ?resp_column_indexes,
|
||||
"count" => resp_data_columns.len()
|
||||
debug!(
|
||||
block_root = %self.block_root,
|
||||
column_indexes = ?resp_column_indexes,
|
||||
count = resp_data_columns.len(),
|
||||
"Sample download success"
|
||||
);
|
||||
metrics::inc_counter_vec(&metrics::SAMPLE_DOWNLOAD_RESULT, &[metrics::SUCCESS]);
|
||||
|
||||
@@ -300,10 +330,10 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
let mut data_columns = vec![];
|
||||
for column_index in column_indexes {
|
||||
let Some(request) = self.column_requests.get_mut(column_index) else {
|
||||
warn!(self.log,
|
||||
"Active column sample request not found";
|
||||
"block_root" => %self.block_root,
|
||||
"column_index" => column_index
|
||||
warn!(
|
||||
block_root = %self.block_root,
|
||||
column_index,
|
||||
"Active column sample request not found"
|
||||
);
|
||||
continue;
|
||||
};
|
||||
@@ -314,10 +344,10 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
else {
|
||||
// Peer does not have the requested data, mark peer as "dont have" and try
|
||||
// again with a different peer.
|
||||
debug!(self.log,
|
||||
"Sampling peer claims to not have the data";
|
||||
"block_root" => %self.block_root,
|
||||
"column_index" => column_index
|
||||
debug!(
|
||||
block_root = %self.block_root,
|
||||
column_index,
|
||||
"Sampling peer claims to not have the data"
|
||||
);
|
||||
request.on_sampling_error()?;
|
||||
continue;
|
||||
@@ -331,16 +361,16 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
.iter()
|
||||
.map(|d| d.index)
|
||||
.collect::<Vec<_>>();
|
||||
debug!(self.log,
|
||||
"Received data that was not requested";
|
||||
"block_root" => %self.block_root,
|
||||
"column_indexes" => ?resp_column_indexes
|
||||
debug!(
|
||||
block_root = %self.block_root,
|
||||
column_indexes = ?resp_column_indexes,
|
||||
"Received data that was not requested"
|
||||
);
|
||||
}
|
||||
|
||||
// Handle the downloaded data columns.
|
||||
if data_columns.is_empty() {
|
||||
debug!(self.log, "Received empty response"; "block_root" => %self.block_root);
|
||||
debug!(block_root = %self.block_root, "Received empty response");
|
||||
self.column_indexes_by_sampling_request
|
||||
.remove(&sampling_request_id);
|
||||
} else {
|
||||
@@ -351,17 +381,17 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
// Peer has data column, send to verify
|
||||
let Some(beacon_processor) = cx.beacon_processor_if_enabled() else {
|
||||
// If processor is not available, error the entire sampling
|
||||
debug!(self.log,
|
||||
"Dropping sampling";
|
||||
"block" => %self.block_root,
|
||||
"reason" => "beacon processor unavailable"
|
||||
debug!(
|
||||
block = %self.block_root,
|
||||
reason = "beacon processor unavailable",
|
||||
"Dropping sampling"
|
||||
);
|
||||
return Err(SamplingError::ProcessorUnavailable);
|
||||
};
|
||||
debug!(self.log,
|
||||
"Sending data_column for verification";
|
||||
"block" => ?self.block_root,
|
||||
"column_indexes" => ?column_indexes
|
||||
debug!(
|
||||
block = ?self.block_root,
|
||||
?column_indexes,
|
||||
"Sending data_column for verification"
|
||||
);
|
||||
if let Err(e) = beacon_processor.send_rpc_validate_data_columns(
|
||||
self.block_root,
|
||||
@@ -375,20 +405,21 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
// Beacon processor is overloaded, drop sampling attempt. Failing to sample
|
||||
// is not a permanent state so we should recover once the node has capacity
|
||||
// and receives a descendant block.
|
||||
error!(self.log,
|
||||
"Dropping sampling";
|
||||
"block" => %self.block_root,
|
||||
"reason" => e.to_string()
|
||||
error!(
|
||||
block = %self.block_root,
|
||||
reason = e.to_string(),
|
||||
"Dropping sampling"
|
||||
);
|
||||
return Err(SamplingError::SendFailed("beacon processor send failure"));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
debug!(self.log, "Sample download error";
|
||||
"block_root" => %self.block_root,
|
||||
"column_indexes" => ?column_indexes,
|
||||
"error" => ?err
|
||||
debug!(
|
||||
block_root = %self.block_root,
|
||||
?column_indexes,
|
||||
error = ?err,
|
||||
"Sample download error"
|
||||
);
|
||||
metrics::inc_counter_vec(&metrics::SAMPLE_DOWNLOAD_RESULT, &[metrics::FAILURE]);
|
||||
|
||||
@@ -396,10 +427,10 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
// reaching this function. Mark the peer as failed and try again with another.
|
||||
for column_index in column_indexes {
|
||||
let Some(request) = self.column_requests.get_mut(column_index) else {
|
||||
warn!(self.log,
|
||||
"Active column sample request not found";
|
||||
"block_root" => %self.block_root,
|
||||
"column_index" => column_index
|
||||
warn!(
|
||||
block_root = %self.block_root,
|
||||
column_index,
|
||||
"Active column sample request not found"
|
||||
);
|
||||
continue;
|
||||
};
|
||||
@@ -429,21 +460,24 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
.column_indexes_by_sampling_request
|
||||
.get(&sampling_request_id)
|
||||
else {
|
||||
error!(self.log, "Column indexes for the sampling request ID not found"; "sampling_request_id" => ?sampling_request_id);
|
||||
error!(
|
||||
?sampling_request_id,
|
||||
"Column indexes for the sampling request ID not found"
|
||||
);
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
match result {
|
||||
Ok(_) => {
|
||||
debug!(self.log, "Sample verification success"; "block_root" => %self.block_root, "column_indexes" => ?column_indexes);
|
||||
debug!(block_root = %self.block_root,?column_indexes, "Sample verification success");
|
||||
metrics::inc_counter_vec(&metrics::SAMPLE_VERIFY_RESULT, &[metrics::SUCCESS]);
|
||||
|
||||
// Valid, continue_sampling will maybe consider sampling a success
|
||||
for column_index in column_indexes {
|
||||
let Some(request) = self.column_requests.get_mut(column_index) else {
|
||||
warn!(
|
||||
self.log,
|
||||
"Active column sample request not found"; "block_root" => %self.block_root, "column_index" => column_index
|
||||
block_root = %self.block_root, column_index,
|
||||
"Active column sample request not found"
|
||||
);
|
||||
continue;
|
||||
};
|
||||
@@ -451,7 +485,7 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
debug!(self.log, "Sample verification failure"; "block_root" => %self.block_root, "column_indexes" => ?column_indexes, "reason" => ?err);
|
||||
debug!(block_root = %self.block_root, ?column_indexes, reason = ?err, "Sample verification failure");
|
||||
metrics::inc_counter_vec(&metrics::SAMPLE_VERIFY_RESULT, &[metrics::FAILURE]);
|
||||
|
||||
// Peer sent invalid data, penalize and try again from different peer
|
||||
@@ -459,8 +493,9 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
for column_index in column_indexes {
|
||||
let Some(request) = self.column_requests.get_mut(column_index) else {
|
||||
warn!(
|
||||
self.log,
|
||||
"Active column sample request not found"; "block_root" => %self.block_root, "column_index" => column_index
|
||||
block_root = %self.block_root,
|
||||
column_index,
|
||||
"Active column sample request not found"
|
||||
);
|
||||
continue;
|
||||
};
|
||||
@@ -570,7 +605,7 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
|
||||
// request was sent, loop to increase the required_successes until the sampling fails if
|
||||
// there are no peers.
|
||||
if ongoings == 0 && !sent_request {
|
||||
debug!(self.log, "Sampling request stalled"; "block_root" => %self.block_root);
|
||||
debug!(block_root = %self.block_root, "Sampling request stalled");
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
|
||||
@@ -3,6 +3,7 @@ use lighthouse_network::rpc::methods::BlocksByRangeRequest;
|
||||
use lighthouse_network::service::api_types::Id;
|
||||
use lighthouse_network::PeerId;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::ops::Sub;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -61,6 +62,7 @@ pub trait BatchConfig {
|
||||
fn batch_attempt_hash<E: EthSpec>(blocks: &[RpcBlock<E>]) -> u64;
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RangeSyncBatchConfig {}
|
||||
|
||||
impl BatchConfig for RangeSyncBatchConfig {
|
||||
@@ -93,6 +95,7 @@ pub enum BatchProcessingResult {
|
||||
NonFaultyFailure,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
/// A segment of a chain.
|
||||
pub struct BatchInfo<E: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
|
||||
/// Start slot of the batch.
|
||||
@@ -113,6 +116,17 @@ pub struct BatchInfo<E: EthSpec, B: BatchConfig = RangeSyncBatchConfig> {
|
||||
marker: std::marker::PhantomData<B>,
|
||||
}
|
||||
|
||||
/// Renders a one-line, human-readable summary of a batch: its start slot, its end slot and its
/// current state, each formatted with its own `Display` implementation.
impl<E: EthSpec, B: BatchConfig> fmt::Display for BatchInfo<E, B> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Borrow only the fields that are printed so they can be captured inline below.
        let Self {
            start_slot,
            end_slot,
            state,
            ..
        } = self;
        write!(
            f,
            "Start Slot: {start_slot}, End Slot: {end_slot}, State: {state}"
        )
    }
}
|
||||
|
||||
#[derive(Display)]
|
||||
/// Current state of a batch
|
||||
pub enum BatchState<E: EthSpec> {
|
||||
/// The batch has failed either downloading or processing, but can be requested again.
|
||||
@@ -190,15 +204,6 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
|
||||
peers
|
||||
}
|
||||
|
||||
/// Return the number of times this batch has failed downloading and failed processing, in this
|
||||
/// order.
|
||||
pub fn failed_attempts(&self) -> (usize, usize) {
|
||||
(
|
||||
self.failed_download_attempts.len(),
|
||||
self.failed_processing_attempts.len(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Verifies if an incoming block belongs to this batch.
|
||||
pub fn is_expecting_block(&self, request_id: &Id) -> bool {
|
||||
if let BatchState::Downloading(_, expected_id) = &self.state {
|
||||
@@ -456,39 +461,6 @@ impl Attempt {
|
||||
}
|
||||
}
|
||||
|
||||
impl<E: EthSpec, B: BatchConfig> slog::KV for &mut BatchInfo<E, B> {
|
||||
fn serialize(
|
||||
&self,
|
||||
record: &slog::Record,
|
||||
serializer: &mut dyn slog::Serializer,
|
||||
) -> slog::Result {
|
||||
slog::KV::serialize(*self, record, serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<E: EthSpec, B: BatchConfig> slog::KV for BatchInfo<E, B> {
|
||||
fn serialize(
|
||||
&self,
|
||||
record: &slog::Record,
|
||||
serializer: &mut dyn slog::Serializer,
|
||||
) -> slog::Result {
|
||||
use slog::Value;
|
||||
Value::serialize(&self.start_slot, record, "start_slot", serializer)?;
|
||||
Value::serialize(
|
||||
&(self.end_slot - 1), // NOTE: The -1 shows inclusive blocks
|
||||
record,
|
||||
"end_slot",
|
||||
serializer,
|
||||
)?;
|
||||
serializer.emit_usize("downloaded", self.failed_download_attempts.len())?;
|
||||
serializer.emit_usize("processed", self.failed_processing_attempts.len())?;
|
||||
serializer.emit_u8("processed_no_penalty", self.non_faulty_processing_attempts)?;
|
||||
serializer.emit_arguments("state", &format_args!("{:?}", self.state))?;
|
||||
serializer.emit_arguments("batch_ty", &format_args!("{}", self.batch_type))?;
|
||||
slog::Result::Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<E: EthSpec> std::fmt::Debug for BatchState<E> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
|
||||
@@ -9,11 +9,13 @@ use beacon_chain::BeaconChainTypes;
|
||||
use fnv::FnvHashMap;
|
||||
use lighthouse_network::service::api_types::Id;
|
||||
use lighthouse_network::{PeerAction, PeerId};
|
||||
use logging::crit;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::Rng;
|
||||
use slog::{crit, debug, o, warn};
|
||||
use std::collections::{btree_map::Entry, BTreeMap, HashSet};
|
||||
use std::fmt;
|
||||
use strum::IntoStaticStr;
|
||||
use tracing::{debug, instrument, warn};
|
||||
use types::{Epoch, EthSpec, Hash256, Slot};
|
||||
|
||||
/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
|
||||
@@ -37,6 +39,7 @@ pub type ProcessingResult = Result<KeepChain, RemoveChain>;
|
||||
|
||||
/// Reasons for removing a chain
|
||||
#[derive(Debug)]
|
||||
#[allow(dead_code)]
|
||||
pub enum RemoveChain {
|
||||
EmptyPeerPool,
|
||||
ChainCompleted,
|
||||
@@ -66,6 +69,7 @@ pub enum SyncingChainType {
|
||||
/// A chain of blocks that need to be downloaded. Peers who claim to contain the target head
|
||||
/// root are grouped into the peer pool and queried for batches when downloading the
|
||||
/// chain.
|
||||
#[derive(Debug)]
|
||||
pub struct SyncingChain<T: BeaconChainTypes> {
|
||||
/// A random id used to identify this chain.
|
||||
id: ChainId,
|
||||
@@ -110,9 +114,16 @@ pub struct SyncingChain<T: BeaconChainTypes> {
|
||||
|
||||
/// The current processing batch, if any.
|
||||
current_processing_batch: Option<BatchId>,
|
||||
}
|
||||
|
||||
/// The chain's log.
|
||||
log: slog::Logger,
|
||||
impl<T: BeaconChainTypes> fmt::Display for SyncingChain<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self.chain_type {
|
||||
SyncingChainType::Head => write!(f, "Head"),
|
||||
SyncingChainType::Finalized => write!(f, "Finalized"),
|
||||
SyncingChainType::Backfill => write!(f, "Backfill"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Debug)]
|
||||
@@ -132,7 +143,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
target_head_root: Hash256,
|
||||
peer_id: PeerId,
|
||||
chain_type: SyncingChainType,
|
||||
log: &slog::Logger,
|
||||
) -> Self {
|
||||
let mut peers = FnvHashMap::default();
|
||||
peers.insert(peer_id, Default::default());
|
||||
@@ -151,7 +161,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
attempted_optimistic_starts: HashSet::default(),
|
||||
state: ChainSyncingState::Stopped,
|
||||
current_processing_batch: None,
|
||||
log: log.new(o!("chain" => id)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -161,21 +170,25 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
/// Check if the chain has peers from which to process batches.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn available_peers(&self) -> usize {
|
||||
self.peers.len()
|
||||
}
|
||||
|
||||
/// Get the chain's id.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn get_id(&self) -> ChainId {
|
||||
self.id
|
||||
}
|
||||
|
||||
/// Peers currently syncing this chain.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn peers(&self) -> impl Iterator<Item = PeerId> + '_ {
|
||||
self.peers.keys().cloned()
|
||||
}
|
||||
|
||||
/// Progress in epochs made by the chain
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn processed_epochs(&self) -> u64 {
|
||||
self.processing_target
|
||||
.saturating_sub(self.start_epoch)
|
||||
@@ -183,6 +196,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
/// Returns the total count of pending blocks in all the batches of this chain
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn pending_blocks(&self) -> usize {
|
||||
self.batches
|
||||
.values()
|
||||
@@ -192,6 +206,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// Removes a peer from the chain.
|
||||
/// If the peer has active batches, those are considered failed and re-requested.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn remove_peer(
|
||||
&mut self,
|
||||
peer_id: &PeerId,
|
||||
@@ -211,8 +226,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
self.retry_batch_download(network, id)?;
|
||||
} else {
|
||||
debug!(self.log, "Batch not found while removing peer";
|
||||
"peer" => %peer_id, "batch" => id)
|
||||
debug!(%peer_id, batch = ?id, "Batch not found while removing peer")
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -225,6 +239,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
/// Returns the latest slot number that has been processed.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn current_processed_slot(&self) -> Slot {
|
||||
// the last slot we processed was included in the previous batch, and corresponds to the
|
||||
// first slot of the current target epoch
|
||||
@@ -234,6 +249,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// A block has been received for a batch on this chain.
|
||||
/// If the block correctly completes the batch it will be processed if possible.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn on_block_response(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -245,7 +261,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// check if we have this batch
|
||||
let batch = match self.batches.get_mut(&batch_id) {
|
||||
None => {
|
||||
debug!(self.log, "Received a block for unknown batch"; "epoch" => batch_id);
|
||||
debug!(epoch = %batch_id, "Received a block for unknown batch");
|
||||
// A batch might get removed when the chain advances, so this is non fatal.
|
||||
return Ok(KeepChain);
|
||||
}
|
||||
@@ -273,7 +289,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
let awaiting_batches = batch_id
|
||||
.saturating_sub(self.optimistic_start.unwrap_or(self.processing_target))
|
||||
/ EPOCHS_PER_BATCH;
|
||||
debug!(self.log, "Batch downloaded"; "epoch" => batch_id, "blocks" => received, "batch_state" => self.visualize_batch_state(), "awaiting_batches" => awaiting_batches);
|
||||
debug!(epoch = %batch_id, blocks = received, batch_state = self.visualize_batch_state(), %awaiting_batches,"Batch downloaded");
|
||||
|
||||
// pre-emptively request more blocks from peers whilst we process current blocks,
|
||||
self.request_batches(network)?;
|
||||
@@ -282,6 +298,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// Processes the batch with the given id.
|
||||
/// The batch must exist and be ready for processing
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn process_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -317,8 +334,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
self.current_processing_batch = Some(batch_id);
|
||||
|
||||
if let Err(e) = beacon_processor.send_chain_segment(process_id, blocks) {
|
||||
crit!(self.log, "Failed to send chain segment to processor."; "msg" => "process_batch",
|
||||
"error" => %e, "batch" => self.processing_target);
|
||||
crit!(msg = "process_batch",error = %e, batch = ?self.processing_target, "Failed to send chain segment to processor.");
|
||||
// This is unlikely to happen but it would stall syncing since the batch now has no
|
||||
// blocks to continue, and the chain is expecting a processing result that won't
|
||||
// arrive. To mitigate this, (fake) fail this processing so that the batch is
|
||||
@@ -330,6 +346,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
/// Processes the next ready batch, prioritizing optimistic batches over the processing target.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn process_completed_batches(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -349,7 +366,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
match state {
|
||||
BatchState::AwaitingProcessing(..) => {
|
||||
// this batch is ready
|
||||
debug!(self.log, "Processing optimistic start"; "epoch" => epoch);
|
||||
debug!(%epoch, "Processing optimistic start");
|
||||
return self.process_batch(network, epoch);
|
||||
}
|
||||
BatchState::Downloading(..) => {
|
||||
@@ -377,7 +394,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// batch has been requested and processed we can land here. We drop the
|
||||
// optimistic candidate since we can't conclude whether the batch included
|
||||
// blocks or not at this point
|
||||
debug!(self.log, "Dropping optimistic candidate"; "batch" => epoch);
|
||||
debug!(batch = %epoch, "Dropping optimistic candidate");
|
||||
self.optimistic_start = None;
|
||||
}
|
||||
}
|
||||
@@ -411,7 +428,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// inside the download buffer (between `self.processing_target` and
|
||||
// `self.to_be_downloaded`). In this case, eventually the chain advances to the
|
||||
// batch (`self.processing_target` reaches this point).
|
||||
debug!(self.log, "Chain encountered a robust batch awaiting validation"; "batch" => self.processing_target);
|
||||
debug!(
|
||||
batch = %self.processing_target,
|
||||
"Chain encountered a robust batch awaiting validation"
|
||||
);
|
||||
|
||||
self.processing_target += EPOCHS_PER_BATCH;
|
||||
if self.to_be_downloaded <= self.processing_target {
|
||||
@@ -436,6 +456,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// The block processor has completed processing a batch. This function handles the result
|
||||
/// of the batch processor.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn on_batch_process_result(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -447,13 +468,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
let batch_state = self.visualize_batch_state();
|
||||
let batch = match &self.current_processing_batch {
|
||||
Some(processing_id) if *processing_id != batch_id => {
|
||||
debug!(self.log, "Unexpected batch result";
|
||||
"batch_epoch" => batch_id, "expected_batch_epoch" => processing_id);
|
||||
debug!(batch_epoch = %batch_id, expected_batch_epoch = %processing_id,"Unexpected batch result");
|
||||
return Ok(KeepChain);
|
||||
}
|
||||
None => {
|
||||
debug!(self.log, "Chain was not expecting a batch result";
|
||||
"batch_epoch" => batch_id);
|
||||
debug!(batch_epoch = %batch_id,"Chain was not expecting a batch result");
|
||||
return Ok(KeepChain);
|
||||
}
|
||||
_ => {
|
||||
@@ -476,8 +495,14 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
})?;
|
||||
|
||||
// Log the process result and the batch for debugging purposes.
|
||||
debug!(self.log, "Batch processing result"; "result" => ?result, &batch,
|
||||
"batch_epoch" => batch_id, "client" => %network.client_type(&peer), "batch_state" => batch_state);
|
||||
debug!(
|
||||
result = ?result,
|
||||
batch_epoch = %batch_id,
|
||||
client = %network.client_type(&peer),
|
||||
batch_state = ?batch_state,
|
||||
?batch,
|
||||
"Batch processing result"
|
||||
);
|
||||
|
||||
// We consider three cases. Batch was successfully processed, Batch failed processing due
|
||||
// to a faulty peer, or batch failed processing but the peer can't be deemed faulty.
|
||||
@@ -563,10 +588,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// There are some edge cases with forks that could land us in this situation.
|
||||
// This should be unlikely, so we tolerate these errors, but not often.
|
||||
warn!(
|
||||
self.log,
|
||||
"Batch failed to download. Dropping chain scoring peers";
|
||||
"score_adjustment" => %penalty,
|
||||
"batch_epoch"=> batch_id,
|
||||
score_adjustment = %penalty,
|
||||
batch_epoch = %batch_id,
|
||||
"Batch failed to download. Dropping chain scoring peers"
|
||||
);
|
||||
|
||||
for (peer, _) in self.peers.drain() {
|
||||
@@ -587,6 +611,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn reject_optimistic_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -599,13 +624,13 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// it. NOTE: this is done to prevent non-sequential batches coming from optimistic
|
||||
// starts from filling up the buffer size
|
||||
if epoch < self.to_be_downloaded {
|
||||
debug!(self.log, "Rejected optimistic batch left for future use"; "epoch" => %epoch, "reason" => reason);
|
||||
debug!(%epoch, reason, "Rejected optimistic batch left for future use");
|
||||
// this batch is now treated as any other batch, and re-requested for future use
|
||||
if redownload {
|
||||
return self.retry_batch_download(network, epoch);
|
||||
}
|
||||
} else {
|
||||
debug!(self.log, "Rejected optimistic batch"; "epoch" => %epoch, "reason" => reason);
|
||||
debug!(%epoch, reason, "Rejected optimistic batch");
|
||||
self.batches.remove(&epoch);
|
||||
}
|
||||
}
|
||||
@@ -621,6 +646,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// If a previous batch has been validated and it had been re-processed, penalize the original
|
||||
/// peer.
|
||||
#[allow(clippy::modulo_one)]
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn advance_chain(&mut self, network: &mut SyncNetworkContext<T>, validating_epoch: Epoch) {
|
||||
// make sure this epoch produces an advancement
|
||||
if validating_epoch <= self.start_epoch {
|
||||
@@ -629,7 +655,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
// safety check for batch boundaries
|
||||
if validating_epoch % EPOCHS_PER_BATCH != self.start_epoch % EPOCHS_PER_BATCH {
|
||||
crit!(self.log, "Validating Epoch is not aligned");
|
||||
crit!("Validating Epoch is not aligned");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -651,9 +677,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// A different peer sent the correct batch, the previous peer did not
|
||||
// We negatively score the original peer.
|
||||
let action = PeerAction::LowToleranceError;
|
||||
debug!(self.log, "Re-processed batch validated. Scoring original peer";
|
||||
"batch_epoch" => id, "score_adjustment" => %action,
|
||||
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
|
||||
debug!(
|
||||
batch_epoch = %id, score_adjustment = %action,
|
||||
original_peer = %attempt.peer_id, new_peer = %processed_attempt.peer_id,
|
||||
"Re-processed batch validated. Scoring original peer"
|
||||
);
|
||||
network.report_peer(
|
||||
attempt.peer_id,
|
||||
@@ -664,9 +691,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// The same peer corrected its previous mistake. There was an error, so we
|
||||
// negative score the original peer.
|
||||
let action = PeerAction::MidToleranceError;
|
||||
debug!(self.log, "Re-processed batch validated by the same peer";
|
||||
"batch_epoch" => id, "score_adjustment" => %action,
|
||||
"original_peer" => %attempt.peer_id, "new_peer" => %processed_attempt.peer_id
|
||||
debug!(
|
||||
batch_epoch = %id,
|
||||
score_adjustment = %action,
|
||||
original_peer = %attempt.peer_id,
|
||||
new_peer = %processed_attempt.peer_id,
|
||||
"Re-processed batch validated by the same peer"
|
||||
);
|
||||
network.report_peer(
|
||||
attempt.peer_id,
|
||||
@@ -683,13 +713,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
active_batches.remove(&id);
|
||||
}
|
||||
}
|
||||
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => crit!(
|
||||
self.log,
|
||||
"batch indicates inconsistent chain state while advancing chain"
|
||||
),
|
||||
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => {
|
||||
crit!("batch indicates inconsistent chain state while advancing chain")
|
||||
}
|
||||
BatchState::AwaitingProcessing(..) => {}
|
||||
BatchState::Processing(_) => {
|
||||
debug!(self.log, "Advancing chain while processing a batch"; "batch" => id, batch);
|
||||
debug!(batch = %id, %batch, "Advancing chain while processing a batch");
|
||||
if let Some(processing_id) = self.current_processing_batch {
|
||||
if id <= processing_id {
|
||||
self.current_processing_batch = None;
|
||||
@@ -713,8 +742,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
self.optimistic_start = None;
|
||||
}
|
||||
}
|
||||
debug!(self.log, "Chain advanced"; "previous_start" => old_start,
|
||||
"new_start" => self.start_epoch, "processing_target" => self.processing_target);
|
||||
debug!(
|
||||
previous_start = %old_start,
|
||||
new_start = %self.start_epoch,
|
||||
processing_target = %self.processing_target,
|
||||
"Chain advanced"
|
||||
);
|
||||
}
|
||||
|
||||
/// An invalid batch has been received that could not be processed, but that can be retried.
|
||||
@@ -722,6 +755,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// These events occur when a peer has successfully responded with blocks, but the blocks we
|
||||
/// have received are incorrect or invalid. This indicates the peer has not performed as
|
||||
/// intended and can result in downvoting a peer.
|
||||
#[instrument(parent = None,level = "info", fields(service = self.id, network), skip_all)]
|
||||
fn handle_invalid_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -781,6 +815,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// This chain has been requested to start syncing.
|
||||
///
|
||||
/// This could be a new chain, or an old chain that is being resumed.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn start_syncing(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -819,6 +854,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// Add a peer to the chain.
|
||||
///
|
||||
/// If the chain is active, this starts requesting batches from this peer.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn add_peer(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -836,6 +872,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// An RPC error has occurred.
|
||||
///
|
||||
/// If the batch exists it is re-requested.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn inject_error(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -852,24 +889,21 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// columns.
|
||||
if !batch.is_expecting_block(&request_id) {
|
||||
debug!(
|
||||
self.log,
|
||||
"Batch not expecting block";
|
||||
"batch_epoch" => batch_id,
|
||||
"batch_state" => ?batch.state(),
|
||||
"peer_id" => %peer_id,
|
||||
"request_id" => %request_id,
|
||||
"batch_state" => batch_state
|
||||
batch_epoch = %batch_id,
|
||||
batch_state = ?batch.state(),
|
||||
%peer_id,
|
||||
%request_id,
|
||||
?batch_state,
|
||||
"Batch not expecting block"
|
||||
);
|
||||
return Ok(KeepChain);
|
||||
}
|
||||
debug!(
|
||||
self.log,
|
||||
"Batch failed. RPC Error";
|
||||
"batch_epoch" => batch_id,
|
||||
"batch_state" => ?batch.state(),
|
||||
"peer_id" => %peer_id,
|
||||
"request_id" => %request_id,
|
||||
"batch_state" => batch_state
|
||||
batch_epoch = %batch_id,
|
||||
batch_state = ?batch.state(),
|
||||
%peer_id,
|
||||
%request_id,
|
||||
"Batch failed. RPC Error"
|
||||
);
|
||||
if let Some(active_requests) = self.peers.get_mut(peer_id) {
|
||||
active_requests.remove(&batch_id);
|
||||
@@ -883,12 +917,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
self.retry_batch_download(network, batch_id)
|
||||
} else {
|
||||
debug!(
|
||||
self.log,
|
||||
"Batch not found";
|
||||
"batch_epoch" => batch_id,
|
||||
"peer_id" => %peer_id,
|
||||
"request_id" => %request_id,
|
||||
"batch_state" => batch_state
|
||||
batch_epoch = %batch_id,
|
||||
%peer_id,
|
||||
%request_id,
|
||||
batch_state,
|
||||
"Batch not found"
|
||||
);
|
||||
// this could be an error for an old batch, removed when the chain advances
|
||||
Ok(KeepChain)
|
||||
@@ -896,6 +929,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
/// Sends and registers the request of a batch awaiting download.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn retry_batch_download(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -932,6 +966,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
/// Requests the batch assigned to the given id from a given peer.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn send_batch(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -958,9 +993,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
.map(|epoch| epoch == batch_id)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
debug!(self.log, "Requesting optimistic batch"; "epoch" => batch_id, &batch, "batch_state" => batch_state);
|
||||
debug!(epoch = %batch_id, %batch, %batch_state, "Requesting optimistic batch");
|
||||
} else {
|
||||
debug!(self.log, "Requesting batch"; "epoch" => batch_id, &batch, "batch_state" => batch_state);
|
||||
debug!(epoch = %batch_id, %batch, %batch_state, "Requesting batch");
|
||||
}
|
||||
// register the batch for this peer
|
||||
return self
|
||||
@@ -979,8 +1014,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
Err(e) => {
|
||||
// NOTE: under normal conditions this shouldn't happen but we handle it anyway
|
||||
warn!(self.log, "Could not send batch request";
|
||||
"batch_id" => batch_id, "error" => ?e, &batch);
|
||||
warn!(%batch_id, error = %e, %batch, "Could not send batch request");
|
||||
// register the failed download and check if the batch can be retried
|
||||
batch.start_downloading_from_peer(peer, 1)?; // fake request_id is not relevant
|
||||
self.peers
|
||||
@@ -1005,6 +1039,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
|
||||
/// Returns true if this chain is currently syncing.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn is_syncing(&self) -> bool {
|
||||
match self.state {
|
||||
ChainSyncingState::Syncing => true,
|
||||
@@ -1014,6 +1049,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// Kickstarts the chain by sending for processing batches that are ready and requesting more
|
||||
/// batches if needed.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
pub fn resume(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -1026,6 +1062,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// Attempts to request the next required batches from the peer pool if the chain is syncing. It will exhaust the peer
|
||||
/// pool and left over batches until the batch buffer is reached or all peers are exhausted.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn request_batches(&mut self, network: &mut SyncNetworkContext<T>) -> ProcessingResult {
|
||||
if !matches!(self.state, ChainSyncingState::Syncing) {
|
||||
return Ok(KeepChain);
|
||||
@@ -1052,10 +1089,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// We wait for this batch before requesting any other batches.
|
||||
if let Some(epoch) = self.optimistic_start {
|
||||
if !self.good_peers_on_sampling_subnets(epoch, network) {
|
||||
debug!(
|
||||
self.log,
|
||||
"Waiting for peers to be available on sampling column subnets"
|
||||
);
|
||||
debug!("Waiting for peers to be available on sampling column subnets");
|
||||
return Ok(KeepChain);
|
||||
}
|
||||
|
||||
@@ -1114,6 +1148,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
|
||||
/// Creates the next required batch from the chain. If there are no more batches required,
|
||||
/// `false` is returned.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn include_next_batch(&mut self, network: &mut SyncNetworkContext<T>) -> Option<BatchId> {
|
||||
// don't request batches beyond the target head slot
|
||||
if self
|
||||
@@ -1147,10 +1182,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
// block and data column requests are currently coupled. This can be removed once we find a
|
||||
// way to decouple the requests and do retries individually, see issue #6258.
|
||||
if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) {
|
||||
debug!(
|
||||
self.log,
|
||||
"Waiting for peers to be available on custody column subnets"
|
||||
);
|
||||
debug!("Waiting for peers to be available on custody column subnets");
|
||||
return None;
|
||||
}
|
||||
|
||||
@@ -1177,6 +1209,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
/// This produces a string of the form: [D,E,E,E,E]
|
||||
/// to indicate the current buffer state of the chain. The symbols are defined on each of the
|
||||
/// batch states. See [BatchState::visualize] for symbol definitions.
|
||||
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
|
||||
fn visualize_batch_state(&self) -> String {
|
||||
let mut visualization_string = String::with_capacity((BATCH_BUFFER_SIZE * 3) as usize);
|
||||
|
||||
@@ -1212,45 +1245,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> slog::KV for &mut SyncingChain<T> {
|
||||
fn serialize(
|
||||
&self,
|
||||
record: &slog::Record,
|
||||
serializer: &mut dyn slog::Serializer,
|
||||
) -> slog::Result {
|
||||
slog::KV::serialize(*self, record, serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> slog::KV for SyncingChain<T> {
|
||||
fn serialize(
|
||||
&self,
|
||||
record: &slog::Record,
|
||||
serializer: &mut dyn slog::Serializer,
|
||||
) -> slog::Result {
|
||||
use slog::Value;
|
||||
serializer.emit_u32("id", self.id)?;
|
||||
Value::serialize(&self.start_epoch, record, "from", serializer)?;
|
||||
Value::serialize(
|
||||
&self.target_head_slot.epoch(T::EthSpec::slots_per_epoch()),
|
||||
record,
|
||||
"to",
|
||||
serializer,
|
||||
)?;
|
||||
serializer.emit_arguments("end_root", &format_args!("{}", self.target_head_root))?;
|
||||
Value::serialize(
|
||||
&self.processing_target,
|
||||
record,
|
||||
"current_target",
|
||||
serializer,
|
||||
)?;
|
||||
serializer.emit_usize("batches", self.batches.len())?;
|
||||
serializer.emit_usize("peers", self.peers.len())?;
|
||||
serializer.emit_arguments("state", &format_args!("{:?}", self.state))?;
|
||||
slog::Result::Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
use super::batch::WrongState as WrongBatchState;
|
||||
impl From<WrongBatchState> for RemoveChain {
|
||||
fn from(err: WrongBatchState) -> Self {
|
||||
|
||||
@@ -12,11 +12,12 @@ use fnv::FnvHashMap;
|
||||
use lighthouse_network::service::api_types::Id;
|
||||
use lighthouse_network::PeerId;
|
||||
use lighthouse_network::SyncInfo;
|
||||
use slog::{crit, debug, error};
|
||||
use logging::crit;
|
||||
use smallvec::SmallVec;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, error};
|
||||
use types::EthSpec;
|
||||
use types::{Epoch, Hash256, Slot};
|
||||
|
||||
@@ -50,18 +51,15 @@ pub struct ChainCollection<T: BeaconChainTypes> {
|
||||
head_chains: FnvHashMap<ChainId, SyncingChain<T>>,
|
||||
/// The current sync state of the process.
|
||||
state: RangeSyncState,
|
||||
/// Logger for the collection.
|
||||
log: slog::Logger,
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
pub fn new(beacon_chain: Arc<BeaconChain<T>>, log: slog::Logger) -> Self {
|
||||
pub fn new(beacon_chain: Arc<BeaconChain<T>>) -> Self {
|
||||
ChainCollection {
|
||||
beacon_chain,
|
||||
finalized_chains: FnvHashMap::default(),
|
||||
head_chains: FnvHashMap::default(),
|
||||
state: RangeSyncState::Idle,
|
||||
log,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -295,9 +293,8 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
.expect("Chain exists");
|
||||
|
||||
match old_id {
|
||||
Some(Some(old_id)) => debug!(self.log, "Switching finalized chains";
|
||||
"old_id" => old_id, &chain),
|
||||
None => debug!(self.log, "Syncing new finalized chain"; &chain),
|
||||
Some(Some(old_id)) => debug!(old_id, %chain, "Switching finalized chains"),
|
||||
None => debug!(%chain, "Syncing new finalized chain"),
|
||||
Some(None) => {
|
||||
// this is the same chain. We try to advance it.
|
||||
}
|
||||
@@ -309,10 +306,10 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
if let Err(remove_reason) = chain.start_syncing(network, local_epoch, local_head_epoch)
|
||||
{
|
||||
if remove_reason.is_critical() {
|
||||
crit!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
|
||||
crit!(chain = new_id, reason = ?remove_reason, "Chain removed while switching chains");
|
||||
} else {
|
||||
// this happens only if sending a batch over the `network` fails a lot
|
||||
error!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
|
||||
error!(chain = new_id, reason = ?remove_reason, "Chain removed while switching chains");
|
||||
}
|
||||
self.finalized_chains.remove(&new_id);
|
||||
self.on_chain_removed(&new_id, true, RangeSyncType::Finalized);
|
||||
@@ -330,7 +327,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
) {
|
||||
// Include the awaiting head peers
|
||||
for (peer_id, peer_sync_info) in awaiting_head_peers.drain() {
|
||||
debug!(self.log, "including head peer");
|
||||
debug!("including head peer");
|
||||
self.add_peer_or_create_chain(
|
||||
local_epoch,
|
||||
peer_sync_info.head_root,
|
||||
@@ -362,16 +359,16 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
if syncing_chains.len() < PARALLEL_HEAD_CHAINS {
|
||||
// start this chain if it's not already syncing
|
||||
if !chain.is_syncing() {
|
||||
debug!(self.log, "New head chain started syncing"; &chain);
|
||||
debug!(%chain, "New head chain started syncing");
|
||||
}
|
||||
if let Err(remove_reason) =
|
||||
chain.start_syncing(network, local_epoch, local_head_epoch)
|
||||
{
|
||||
self.head_chains.remove(&id);
|
||||
if remove_reason.is_critical() {
|
||||
crit!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
|
||||
crit!(chain = id, reason = ?remove_reason, "Chain removed while switching head chains");
|
||||
} else {
|
||||
error!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
|
||||
error!(chain = id, reason = ?remove_reason, "Chain removed while switching head chains");
|
||||
}
|
||||
} else {
|
||||
syncing_chains.push(id);
|
||||
@@ -407,7 +404,6 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
.start_slot(T::EthSpec::slots_per_epoch());
|
||||
|
||||
let beacon_chain = &self.beacon_chain;
|
||||
let log_ref = &self.log;
|
||||
|
||||
let is_outdated = |target_slot: &Slot, target_root: &Hash256| {
|
||||
target_slot <= &local_finalized_slot
|
||||
@@ -425,7 +421,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
if is_outdated(&chain.target_head_slot, &chain.target_head_root)
|
||||
|| chain.available_peers() == 0
|
||||
{
|
||||
debug!(log_ref, "Purging out of finalized chain"; &chain);
|
||||
debug!(%chain, "Purging out of finalized chain");
|
||||
Some((*id, chain.is_syncing(), RangeSyncType::Finalized))
|
||||
} else {
|
||||
None
|
||||
@@ -436,7 +432,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
if is_outdated(&chain.target_head_slot, &chain.target_head_root)
|
||||
|| chain.available_peers() == 0
|
||||
{
|
||||
debug!(log_ref, "Purging out of date head chain"; &chain);
|
||||
debug!(%chain, "Purging out of date head chain");
|
||||
Some((*id, chain.is_syncing(), RangeSyncType::Head))
|
||||
} else {
|
||||
None
|
||||
@@ -477,14 +473,14 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
.find(|(_, chain)| chain.has_same_target(target_head_slot, target_head_root))
|
||||
{
|
||||
Some((&id, chain)) => {
|
||||
debug!(self.log, "Adding peer to known chain"; "peer_id" => %peer, "sync_type" => ?sync_type, "id" => id);
|
||||
debug!(peer_id = %peer, ?sync_type, id, "Adding peer to known chain");
|
||||
debug_assert_eq!(chain.target_head_root, target_head_root);
|
||||
debug_assert_eq!(chain.target_head_slot, target_head_slot);
|
||||
if let Err(remove_reason) = chain.add_peer(network, peer) {
|
||||
if remove_reason.is_critical() {
|
||||
crit!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
|
||||
crit!(chain = %id, reason = ?remove_reason, "Chain removed after adding peer");
|
||||
} else {
|
||||
error!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
|
||||
error!(chain = %id, reason = ?remove_reason, "Chain removed after adding peer");
|
||||
}
|
||||
let is_syncing = chain.is_syncing();
|
||||
collection.remove(&id);
|
||||
@@ -501,9 +497,9 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
target_head_root,
|
||||
peer,
|
||||
sync_type.into(),
|
||||
&self.log,
|
||||
);
|
||||
debug!(self.log, "New chain added to sync"; "peer_id" => peer_rpr, "sync_type" => ?sync_type, &new_chain);
|
||||
|
||||
debug!(peer_id = peer_rpr, ?sync_type, %new_chain, "New chain added to sync");
|
||||
collection.insert(id, new_chain);
|
||||
metrics::inc_counter_vec(&metrics::SYNCING_CHAINS_ADDED, &[sync_type.as_str()]);
|
||||
self.update_metrics();
|
||||
|
||||
@@ -51,10 +51,11 @@ use beacon_chain::{BeaconChain, BeaconChainTypes};
|
||||
use lighthouse_network::rpc::GoodbyeReason;
|
||||
use lighthouse_network::service::api_types::Id;
|
||||
use lighthouse_network::{PeerId, SyncInfo};
|
||||
use logging::crit;
|
||||
use lru_cache::LRUTimeCache;
|
||||
use slog::{crit, debug, trace, warn};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, instrument, trace, warn};
|
||||
use types::{Epoch, EthSpec, Hash256};
|
||||
|
||||
/// For how long we store failed finalized chains to prevent retries.
|
||||
@@ -74,23 +75,26 @@ pub struct RangeSync<T: BeaconChainTypes> {
|
||||
chains: ChainCollection<T>,
|
||||
/// Chains that have failed and are stored to prevent being retried.
|
||||
failed_chains: LRUTimeCache<Hash256>,
|
||||
/// The syncing logger.
|
||||
log: slog::Logger,
|
||||
}
|
||||
|
||||
impl<T: BeaconChainTypes> RangeSync<T>
|
||||
where
|
||||
T: BeaconChainTypes,
|
||||
{
|
||||
pub fn new(beacon_chain: Arc<BeaconChain<T>>, log: slog::Logger) -> Self {
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn new(beacon_chain: Arc<BeaconChain<T>>) -> Self {
|
||||
RangeSync {
|
||||
beacon_chain: beacon_chain.clone(),
|
||||
chains: ChainCollection::new(beacon_chain, log.clone()),
|
||||
chains: ChainCollection::new(beacon_chain),
|
||||
failed_chains: LRUTimeCache::new(std::time::Duration::from_secs(
|
||||
FAILED_CHAINS_EXPIRY_SECONDS,
|
||||
)),
|
||||
awaiting_head_peers: HashMap::new(),
|
||||
log,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,6 +103,12 @@ where
|
||||
self.failed_chains.keys().copied().collect()
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn state(&self) -> SyncChainStatus {
|
||||
self.chains.state()
|
||||
}
|
||||
@@ -108,6 +118,12 @@ where
|
||||
/// may need to be synced as a result. A new peer may increase the peer pool of a finalized
|
||||
/// chain, this may result in a different finalized chain from syncing as finalized chains are
|
||||
/// prioritised by peer-pool size.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn add_peer(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -133,14 +149,13 @@ where
|
||||
RangeSyncType::Finalized => {
|
||||
// Make sure we have not recently tried this chain
|
||||
if self.failed_chains.contains(&remote_info.finalized_root) {
|
||||
debug!(self.log, "Disconnecting peer that belongs to previously failed chain";
|
||||
"failed_root" => %remote_info.finalized_root, "peer_id" => %peer_id);
|
||||
debug!(failed_root = ?remote_info.finalized_root, %peer_id,"Disconnecting peer that belongs to previously failed chain");
|
||||
network.goodbye_peer(peer_id, GoodbyeReason::IrrelevantNetwork);
|
||||
return;
|
||||
}
|
||||
|
||||
// Finalized chain search
|
||||
debug!(self.log, "Finalization sync peer joined"; "peer_id" => %peer_id);
|
||||
debug!(%peer_id, "Finalization sync peer joined");
|
||||
self.awaiting_head_peers.remove(&peer_id);
|
||||
|
||||
// Because of our change in finalized sync batch size from 2 to 1 and our transition
|
||||
@@ -171,8 +186,7 @@ where
|
||||
if self.chains.is_finalizing_sync() {
|
||||
// If there are finalized chains to sync, finish these first, before syncing head
|
||||
// chains.
|
||||
trace!(self.log, "Waiting for finalized sync to complete";
|
||||
"peer_id" => %peer_id, "awaiting_head_peers" => &self.awaiting_head_peers.len());
|
||||
trace!(%peer_id, awaiting_head_peers = &self.awaiting_head_peers.len(),"Waiting for finalized sync to complete");
|
||||
self.awaiting_head_peers.insert(peer_id, remote_info);
|
||||
return;
|
||||
}
|
||||
@@ -204,6 +218,12 @@ where
|
||||
///
|
||||
/// This function finds the chain that made this request. Once found, processes the result.
|
||||
/// This request could complete a chain or simply add to its progress.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn blocks_by_range_response(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -229,11 +249,17 @@ where
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
|
||||
trace!(%chain_id, "BlocksByRange response for removed chain")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn handle_block_process_result(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -259,13 +285,19 @@ where
|
||||
}
|
||||
|
||||
Err(_) => {
|
||||
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
|
||||
trace!(%chain_id, "BlocksByRange response for removed chain")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A peer has disconnected. This removes the peer from any ongoing chains and mappings. A
|
||||
/// disconnected peer could remove a chain
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn peer_disconnect(&mut self, network: &mut SyncNetworkContext<T>, peer_id: &PeerId) {
|
||||
// if the peer is in the awaiting head mapping, remove it
|
||||
self.awaiting_head_peers.remove(peer_id);
|
||||
@@ -278,6 +310,12 @@ where
|
||||
/// which pool the peer is in. The chain may also have a batch or batches awaiting
|
||||
/// for this peer. If so we mark the batch as failed. The batch may then hit its maximum
|
||||
/// retries. In this case, we need to remove the chain.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn remove_peer(&mut self, network: &mut SyncNetworkContext<T>, peer_id: &PeerId) {
|
||||
for (removed_chain, sync_type, remove_reason) in self
|
||||
.chains
|
||||
@@ -297,6 +335,12 @@ where
|
||||
///
|
||||
/// Check to see if the request corresponds to a pending batch. If so, re-request it if possible, if there have
|
||||
/// been too many failed attempts for the batch, remove the chain.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn inject_error(
|
||||
&mut self,
|
||||
network: &mut SyncNetworkContext<T>,
|
||||
@@ -321,11 +365,17 @@ where
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
|
||||
trace!(%chain_id, "BlocksByRange response for removed chain")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
fn on_chain_removed(
|
||||
&mut self,
|
||||
chain: SyncingChain<T>,
|
||||
@@ -335,14 +385,18 @@ where
|
||||
op: &'static str,
|
||||
) {
|
||||
if remove_reason.is_critical() {
|
||||
crit!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
|
||||
crit!(?sync_type, %chain, reason = ?remove_reason,op, "Chain removed");
|
||||
} else {
|
||||
debug!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
|
||||
debug!(?sync_type, %chain, reason = ?remove_reason,op, "Chain removed");
|
||||
}
|
||||
|
||||
if let RemoveChain::ChainFailed { blacklist, .. } = remove_reason {
|
||||
if RangeSyncType::Finalized == sync_type && blacklist {
|
||||
warn!(self.log, "Chain failed! Syncing to its head won't be retried for at least the next {} seconds", FAILED_CHAINS_EXPIRY_SECONDS; &chain);
|
||||
warn!(
|
||||
%chain,
|
||||
"Chain failed! Syncing to its head won't be retried for at least the next {} seconds",
|
||||
FAILED_CHAINS_EXPIRY_SECONDS
|
||||
);
|
||||
self.failed_chains.insert(chain.target_head_root);
|
||||
}
|
||||
}
|
||||
@@ -369,6 +423,12 @@ where
|
||||
}
|
||||
|
||||
/// Kickstarts sync.
|
||||
#[instrument(parent = None,
|
||||
level = "info",
|
||||
fields(component = "range_sync"),
|
||||
name = "range_sync",
|
||||
skip_all
|
||||
)]
|
||||
pub fn resume(&mut self, network: &mut SyncNetworkContext<T>) {
|
||||
for (removed_chain, sync_type, remove_reason) in
|
||||
self.chains.call_all(|chain| chain.resume(network))
|
||||
|
||||
@@ -19,8 +19,8 @@ use beacon_chain::{
|
||||
block_verification_types::{AsBlock, BlockImportData},
|
||||
data_availability_checker::Availability,
|
||||
test_utils::{
|
||||
build_log, generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec,
|
||||
BeaconChainHarness, EphemeralHarnessType, LoggerType, NumBlobs,
|
||||
generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec,
|
||||
BeaconChainHarness, EphemeralHarnessType, NumBlobs,
|
||||
},
|
||||
validator_monitor::timestamp_now,
|
||||
AvailabilityPendingExecutedBlock, AvailabilityProcessingStatus, BlockError,
|
||||
@@ -37,9 +37,9 @@ use lighthouse_network::{
|
||||
types::SyncState,
|
||||
NetworkConfig, NetworkGlobals, PeerId,
|
||||
};
|
||||
use slog::info;
|
||||
use slot_clock::{SlotClock, TestingSlotClock};
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::info;
|
||||
use types::{
|
||||
data_column_sidecar::ColumnIndex,
|
||||
test_utils::{SeedableRng, TestRandom, XorShiftRng},
|
||||
@@ -55,22 +55,12 @@ type DCByRootId = (SyncRequestId, Vec<ColumnIndex>);
|
||||
|
||||
impl TestRig {
|
||||
pub fn test_setup() -> Self {
|
||||
let logger_type = if cfg!(feature = "test_logger") {
|
||||
LoggerType::Test
|
||||
} else if cfg!(feature = "ci_logger") {
|
||||
LoggerType::CI
|
||||
} else {
|
||||
LoggerType::Null
|
||||
};
|
||||
let log = build_log(slog::Level::Trace, logger_type);
|
||||
|
||||
// Use `fork_from_env` logic to set correct fork epochs
|
||||
let spec = test_spec::<E>();
|
||||
|
||||
// Initialise a new beacon chain
|
||||
let harness = BeaconChainHarness::<EphemeralHarnessType<E>>::builder(E)
|
||||
.spec(Arc::new(spec))
|
||||
.logger(log.clone())
|
||||
.deterministic_keypairs(1)
|
||||
.fresh_ephemeral_store()
|
||||
.mock_execution_layer()
|
||||
@@ -95,7 +85,6 @@ impl TestRig {
|
||||
let network_config = Arc::new(NetworkConfig::default());
|
||||
let globals = Arc::new(NetworkGlobals::new_test_globals(
|
||||
Vec::new(),
|
||||
&log,
|
||||
network_config,
|
||||
chain.spec.clone(),
|
||||
));
|
||||
@@ -104,7 +93,6 @@ impl TestRig {
|
||||
sync_tx,
|
||||
chain.clone(),
|
||||
harness.runtime.task_executor.clone(),
|
||||
log.clone(),
|
||||
);
|
||||
|
||||
let fork_name = chain.spec.fork_name_at_slot::<E>(chain.slot().unwrap());
|
||||
@@ -137,11 +125,9 @@ impl TestRig {
|
||||
required_successes: vec![SAMPLING_REQUIRED_SUCCESSES],
|
||||
},
|
||||
fork_context,
|
||||
log.clone(),
|
||||
),
|
||||
harness,
|
||||
fork_name,
|
||||
log,
|
||||
spec,
|
||||
}
|
||||
}
|
||||
@@ -165,7 +151,7 @@ impl TestRig {
|
||||
}
|
||||
|
||||
pub fn log(&self, msg: &str) {
|
||||
info!(self.log, "TEST_RIG"; "msg" => msg);
|
||||
info!(msg, "TEST_RIG");
|
||||
}
|
||||
|
||||
pub fn after_deneb(&self) -> bool {
|
||||
@@ -2318,11 +2304,6 @@ mod deneb_only {
|
||||
})
|
||||
}
|
||||
|
||||
fn log(self, msg: &str) -> Self {
|
||||
self.rig.log(msg);
|
||||
self
|
||||
}
|
||||
|
||||
fn trigger_unknown_block_from_attestation(mut self) -> Self {
|
||||
let block_root = self.block.canonical_root();
|
||||
self.rig
|
||||
@@ -2626,6 +2607,11 @@ mod deneb_only {
|
||||
.block_imported()
|
||||
}
|
||||
|
||||
fn log(self, msg: &str) -> Self {
|
||||
self.rig.log(msg);
|
||||
self
|
||||
}
|
||||
|
||||
fn parent_block_then_empty_parent_blobs(self) -> Self {
|
||||
self.log(
|
||||
" Return empty blobs for parent, block errors with missing components, downscore",
|
||||
|
||||
@@ -8,7 +8,6 @@ use beacon_chain::test_utils::{BeaconChainHarness, EphemeralHarnessType};
|
||||
use beacon_processor::WorkEvent;
|
||||
use lighthouse_network::NetworkGlobals;
|
||||
use rand_chacha::ChaCha20Rng;
|
||||
use slog::Logger;
|
||||
use slot_clock::ManualSlotClock;
|
||||
use std::sync::Arc;
|
||||
use store::MemoryStore;
|
||||
@@ -64,6 +63,5 @@ struct TestRig {
|
||||
/// `rng` for generating test blocks and blobs.
|
||||
rng: ChaCha20Rng,
|
||||
fork_name: ForkName,
|
||||
log: Logger,
|
||||
spec: Arc<ChainSpec>,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user