Integrate tracing (#6339)

Tracing Integration
- [reference](5bbf1859e9/projects/project-ideas.md#L297)


- [x] replace slog & log with tracing throughout the codebase
- [x] implement custom crit log
- [x] make relevant changes in the formatter
- [x] replace sloggers
- [x] re-write SSE logging components

cc: @macladson @eserilev
This commit is contained in:
ThreeHrSleep
2025-03-13 04:01:05 +05:30
committed by GitHub
parent f23f984f85
commit d60c24ef1c
241 changed files with 9485 additions and 9328 deletions

View File

@@ -13,12 +13,12 @@ metrics = { workspace = true }
num_cpus = { workspace = true }
parking_lot = { workspace = true }
serde = { workspace = true }
slog = { workspace = true }
slot_clock = { workspace = true }
strum = { workspace = true }
task_executor = { workspace = true }
tokio = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
types = { workspace = true }
[dev-dependencies]

View File

@@ -44,10 +44,10 @@ use crate::work_reprocessing_queue::{
use futures::stream::{Stream, StreamExt};
use futures::task::Poll;
use lighthouse_network::{MessageId, NetworkGlobals, PeerId};
use logging::crit;
use logging::TimeLatch;
use parking_lot::Mutex;
use serde::{Deserialize, Serialize};
use slog::{crit, debug, error, trace, warn, Logger};
use slot_clock::SlotClock;
use std::cmp;
use std::collections::{HashSet, VecDeque};
@@ -61,6 +61,7 @@ use strum::IntoStaticStr;
use task_executor::TaskExecutor;
use tokio::sync::mpsc;
use tokio::sync::mpsc::error::TrySendError;
use tracing::{debug, error, trace, warn};
use types::{
Attestation, BeaconState, ChainSpec, EthSpec, Hash256, RelativeEpoch, SignedAggregateAndProof,
SingleAttestation, Slot, SubnetId,
@@ -305,14 +306,13 @@ impl<T> FifoQueue<T> {
/// Add a new item to the queue.
///
/// Drops `item` if the queue is full.
pub fn push(&mut self, item: T, item_desc: &str, log: &Logger) {
pub fn push(&mut self, item: T, item_desc: &str) {
if self.queue.len() == self.max_length {
error!(
log,
"Work queue is full";
"msg" => "the system has insufficient resources for load",
"queue_len" => self.max_length,
"queue" => item_desc,
msg = "the system has insufficient resources for load",
queue_len = self.max_length,
queue = item_desc,
"Work queue is full"
)
} else {
self.queue.push_back(item);
@@ -827,7 +827,6 @@ pub struct BeaconProcessor<E: EthSpec> {
pub executor: TaskExecutor,
pub current_workers: usize,
pub config: BeaconProcessorConfig,
pub log: Logger,
}
impl<E: EthSpec> BeaconProcessor<E> {
@@ -938,7 +937,6 @@ impl<E: EthSpec> BeaconProcessor<E> {
work_reprocessing_rx,
&self.executor,
Arc::new(slot_clock),
self.log.clone(),
maximum_gossip_clock_disparity,
)?;
@@ -969,9 +967,8 @@ impl<E: EthSpec> BeaconProcessor<E> {
{
Err(e) => {
warn!(
self.log,
"Unable to queue backfill work event. Will try to process now.";
"error" => %e
error = %e,
"Unable to queue backfill work event. Will try to process now."
);
match e {
TrySendError::Full(reprocess_queue_message)
@@ -982,9 +979,8 @@ impl<E: EthSpec> BeaconProcessor<E> {
) => Some(backfill_batch.into()),
other => {
crit!(
self.log,
"Unexpected queue message type";
"message_type" => other.as_ref()
message_type = other.as_ref(),
"Unexpected queue message type"
);
// This is an unhandled exception, drop the message.
continue;
@@ -1005,11 +1001,7 @@ impl<E: EthSpec> BeaconProcessor<E> {
Some(InboundEvent::WorkEvent(event))
| Some(InboundEvent::ReprocessingWork(event)) => Some(event),
None => {
debug!(
self.log,
"Gossip processor stopped";
"msg" => "stream ended"
);
debug!(msg = "stream ended", "Gossip processor stopped");
break;
}
};
@@ -1050,238 +1042,234 @@ impl<E: EthSpec> BeaconProcessor<E> {
None if can_spawn => {
// Check for chain segments first, they're the most efficient way to get
// blocks into the system.
let work_event: Option<Work<E>> = if let Some(item) =
chain_segment_queue.pop()
{
Some(item)
// Check sync blocks before gossip blocks, since we've already explicitly
// requested these blocks.
} else if let Some(item) = rpc_block_queue.pop() {
Some(item)
} else if let Some(item) = rpc_blob_queue.pop() {
Some(item)
} else if let Some(item) = rpc_custody_column_queue.pop() {
Some(item)
// TODO(das): decide proper prioritization for sampling columns
} else if let Some(item) = rpc_custody_column_queue.pop() {
Some(item)
} else if let Some(item) = rpc_verify_data_column_queue.pop() {
Some(item)
} else if let Some(item) = sampling_result_queue.pop() {
Some(item)
// Check delayed blocks before gossip blocks, the gossip blocks might rely
// on the delayed ones.
} else if let Some(item) = delayed_block_queue.pop() {
Some(item)
// Check gossip blocks before gossip attestations, since a block might be
// required to verify some attestations.
} else if let Some(item) = gossip_block_queue.pop() {
Some(item)
} else if let Some(item) = gossip_blob_queue.pop() {
Some(item)
} else if let Some(item) = gossip_data_column_queue.pop() {
Some(item)
// Check the priority 0 API requests after blocks and blobs, but before attestations.
} else if let Some(item) = api_request_p0_queue.pop() {
Some(item)
// Check the aggregates, *then* the unaggregates since we assume that
// aggregates are more valuable to local validators and effectively give us
// more information with less signature verification time.
} else if aggregate_queue.len() > 0 {
let batch_size = cmp::min(
aggregate_queue.len(),
self.config.max_gossip_aggregate_batch_size,
);
let work_event: Option<Work<E>> =
if let Some(item) = chain_segment_queue.pop() {
Some(item)
// Check sync blocks before gossip blocks, since we've already explicitly
// requested these blocks.
} else if let Some(item) = rpc_block_queue.pop() {
Some(item)
} else if let Some(item) = rpc_blob_queue.pop() {
Some(item)
} else if let Some(item) = rpc_custody_column_queue.pop() {
Some(item)
// TODO(das): decide proper prioritization for sampling columns
} else if let Some(item) = rpc_custody_column_queue.pop() {
Some(item)
} else if let Some(item) = rpc_verify_data_column_queue.pop() {
Some(item)
} else if let Some(item) = sampling_result_queue.pop() {
Some(item)
// Check delayed blocks before gossip blocks, the gossip blocks might rely
// on the delayed ones.
} else if let Some(item) = delayed_block_queue.pop() {
Some(item)
// Check gossip blocks before gossip attestations, since a block might be
// required to verify some attestations.
} else if let Some(item) = gossip_block_queue.pop() {
Some(item)
} else if let Some(item) = gossip_blob_queue.pop() {
Some(item)
} else if let Some(item) = gossip_data_column_queue.pop() {
Some(item)
// Check the priority 0 API requests after blocks and blobs, but before attestations.
} else if let Some(item) = api_request_p0_queue.pop() {
Some(item)
// Check the aggregates, *then* the unaggregates since we assume that
// aggregates are more valuable to local validators and effectively give us
// more information with less signature verification time.
} else if aggregate_queue.len() > 0 {
let batch_size = cmp::min(
aggregate_queue.len(),
self.config.max_gossip_aggregate_batch_size,
);
if batch_size < 2 {
// One single aggregate is in the queue, process it individually.
aggregate_queue.pop()
} else {
// Collect two or more aggregates into a batch, so they can take
// advantage of batch signature verification.
//
// Note: this will convert the `Work::GossipAggregate` item into a
// `Work::GossipAggregateBatch` item.
let mut aggregates = Vec::with_capacity(batch_size);
let mut process_batch_opt = None;
for _ in 0..batch_size {
if let Some(item) = aggregate_queue.pop() {
match item {
Work::GossipAggregate {
aggregate,
process_individual: _,
process_batch,
} => {
aggregates.push(*aggregate);
if process_batch_opt.is_none() {
process_batch_opt = Some(process_batch);
if batch_size < 2 {
// One single aggregate is in the queue, process it individually.
aggregate_queue.pop()
} else {
// Collect two or more aggregates into a batch, so they can take
// advantage of batch signature verification.
//
// Note: this will convert the `Work::GossipAggregate` item into a
// `Work::GossipAggregateBatch` item.
let mut aggregates = Vec::with_capacity(batch_size);
let mut process_batch_opt = None;
for _ in 0..batch_size {
if let Some(item) = aggregate_queue.pop() {
match item {
Work::GossipAggregate {
aggregate,
process_individual: _,
process_batch,
} => {
aggregates.push(*aggregate);
if process_batch_opt.is_none() {
process_batch_opt = Some(process_batch);
}
}
_ => {
error!("Invalid item in aggregate queue");
}
}
_ => {
error!(self.log, "Invalid item in aggregate queue");
}
}
}
}
if let Some(process_batch) = process_batch_opt {
// Process all aggregates with a single worker.
Some(Work::GossipAggregateBatch {
aggregates,
process_batch,
})
} else {
// There is no good reason for this to
// happen, it is a serious logic error.
// Since we only form batches when multiple
// work items exist, we should always have a
// work closure at this point.
crit!(self.log, "Missing aggregate work");
None
}
}
// Check the unaggregated attestation queue.
//
// Potentially use batching.
} else if attestation_queue.len() > 0 {
let batch_size = cmp::min(
attestation_queue.len(),
self.config.max_gossip_attestation_batch_size,
);
if batch_size < 2 {
// One single attestation is in the queue, process it individually.
attestation_queue.pop()
} else {
// Collect two or more attestations into a batch, so they can take
// advantage of batch signature verification.
//
// Note: this will convert the `Work::GossipAttestation` item into a
// `Work::GossipAttestationBatch` item.
let mut attestations = Vec::with_capacity(batch_size);
let mut process_batch_opt = None;
for _ in 0..batch_size {
if let Some(item) = attestation_queue.pop() {
match item {
Work::GossipAttestation {
attestation,
process_individual: _,
process_batch,
} => {
attestations.push(*attestation);
if process_batch_opt.is_none() {
process_batch_opt = Some(process_batch);
}
}
_ => error!(
self.log,
"Invalid item in attestation queue"
),
}
if let Some(process_batch) = process_batch_opt {
// Process all aggregates with a single worker.
Some(Work::GossipAggregateBatch {
aggregates,
process_batch,
})
} else {
// There is no good reason for this to
// happen, it is a serious logic error.
// Since we only form batches when multiple
// work items exist, we should always have a
// work closure at this point.
crit!("Missing aggregate work");
None
}
}
// Check the unaggregated attestation queue.
//
// Potentially use batching.
} else if attestation_queue.len() > 0 {
let batch_size = cmp::min(
attestation_queue.len(),
self.config.max_gossip_attestation_batch_size,
);
if let Some(process_batch) = process_batch_opt {
// Process all attestations with a single worker.
Some(Work::GossipAttestationBatch {
attestations,
process_batch,
})
if batch_size < 2 {
// One single attestation is in the queue, process it individually.
attestation_queue.pop()
} else {
// There is no good reason for this to
// happen, it is a serious logic error.
// Since we only form batches when multiple
// work items exist, we should always have a
// work closure at this point.
crit!(self.log, "Missing attestations work");
None
// Collect two or more attestations into a batch, so they can take
// advantage of batch signature verification.
//
// Note: this will convert the `Work::GossipAttestation` item into a
// `Work::GossipAttestationBatch` item.
let mut attestations = Vec::with_capacity(batch_size);
let mut process_batch_opt = None;
for _ in 0..batch_size {
if let Some(item) = attestation_queue.pop() {
match item {
Work::GossipAttestation {
attestation,
process_individual: _,
process_batch,
} => {
attestations.push(*attestation);
if process_batch_opt.is_none() {
process_batch_opt = Some(process_batch);
}
}
_ => error!("Invalid item in attestation queue"),
}
}
}
if let Some(process_batch) = process_batch_opt {
// Process all attestations with a single worker.
Some(Work::GossipAttestationBatch {
attestations,
process_batch,
})
} else {
// There is no good reason for this to
// happen, it is a serious logic error.
// Since we only form batches when multiple
// work items exist, we should always have a
// work closure at this point.
crit!("Missing attestations work");
None
}
}
}
// Convert any gossip attestations that need to be converted.
} else if let Some(item) = attestation_to_convert_queue.pop() {
Some(item)
// Check sync committee messages after attestations as their rewards are lesser
// and they don't influence fork choice.
} else if let Some(item) = sync_contribution_queue.pop() {
Some(item)
} else if let Some(item) = sync_message_queue.pop() {
Some(item)
// Aggregates and unaggregates queued for re-processing are older and we
// care about fresher ones, so check those first.
} else if let Some(item) = unknown_block_aggregate_queue.pop() {
Some(item)
} else if let Some(item) = unknown_block_attestation_queue.pop() {
Some(item)
// Check RPC methods next. Status messages are needed for sync so
// prioritize them over syncing requests from other peers (BlocksByRange
// and BlocksByRoot)
} else if let Some(item) = status_queue.pop() {
Some(item)
} else if let Some(item) = bbrange_queue.pop() {
Some(item)
} else if let Some(item) = bbroots_queue.pop() {
Some(item)
} else if let Some(item) = blbrange_queue.pop() {
Some(item)
} else if let Some(item) = blbroots_queue.pop() {
Some(item)
} else if let Some(item) = dcbroots_queue.pop() {
Some(item)
} else if let Some(item) = dcbrange_queue.pop() {
Some(item)
// Prioritize sampling requests after block syncing requests
} else if let Some(item) = unknown_block_sampling_request_queue.pop() {
Some(item)
// Check slashings after all other consensus messages so we prioritize
// following head.
//
// Check attester slashings before proposer slashings since they have the
// potential to slash multiple validators at once.
} else if let Some(item) = gossip_attester_slashing_queue.pop() {
Some(item)
} else if let Some(item) = gossip_proposer_slashing_queue.pop() {
Some(item)
// Check exits and address changes late since our validators don't get
// rewards from them.
} else if let Some(item) = gossip_voluntary_exit_queue.pop() {
Some(item)
} else if let Some(item) = gossip_bls_to_execution_change_queue.pop() {
Some(item)
// Check the priority 1 API requests after we've
// processed all the interesting things from the network
// and things required for us to stay in good repute
// with our P2P peers.
} else if let Some(item) = api_request_p1_queue.pop() {
Some(item)
// Handle backfill sync chain segments.
} else if let Some(item) = backfill_chain_segment.pop() {
Some(item)
// Handle light client requests.
} else if let Some(item) = lc_gossip_finality_update_queue.pop() {
Some(item)
} else if let Some(item) = lc_gossip_optimistic_update_queue.pop() {
Some(item)
} else if let Some(item) = unknown_light_client_update_queue.pop() {
Some(item)
} else if let Some(item) = lc_bootstrap_queue.pop() {
Some(item)
} else if let Some(item) = lc_rpc_optimistic_update_queue.pop() {
Some(item)
} else if let Some(item) = lc_rpc_finality_update_queue.pop() {
Some(item)
} else if let Some(item) = lc_update_range_queue.pop() {
Some(item)
// This statement should always be the final else statement.
} else {
// Let the journal know that a worker is freed and there's nothing else
// for it to do.
if let Some(work_journal_tx) = &work_journal_tx {
// We don't care if this message was successfully sent, we only use the journal
// during testing.
let _ = work_journal_tx.try_send(NOTHING_TO_DO);
}
None
};
// Convert any gossip attestations that need to be converted.
} else if let Some(item) = attestation_to_convert_queue.pop() {
Some(item)
// Check sync committee messages after attestations as their rewards are lesser
// and they don't influence fork choice.
} else if let Some(item) = sync_contribution_queue.pop() {
Some(item)
} else if let Some(item) = sync_message_queue.pop() {
Some(item)
// Aggregates and unaggregates queued for re-processing are older and we
// care about fresher ones, so check those first.
} else if let Some(item) = unknown_block_aggregate_queue.pop() {
Some(item)
} else if let Some(item) = unknown_block_attestation_queue.pop() {
Some(item)
// Check RPC methods next. Status messages are needed for sync so
// prioritize them over syncing requests from other peers (BlocksByRange
// and BlocksByRoot)
} else if let Some(item) = status_queue.pop() {
Some(item)
} else if let Some(item) = bbrange_queue.pop() {
Some(item)
} else if let Some(item) = bbroots_queue.pop() {
Some(item)
} else if let Some(item) = blbrange_queue.pop() {
Some(item)
} else if let Some(item) = blbroots_queue.pop() {
Some(item)
} else if let Some(item) = dcbroots_queue.pop() {
Some(item)
} else if let Some(item) = dcbrange_queue.pop() {
Some(item)
// Prioritize sampling requests after block syncing requests
} else if let Some(item) = unknown_block_sampling_request_queue.pop() {
Some(item)
// Check slashings after all other consensus messages so we prioritize
// following head.
//
// Check attester slashings before proposer slashings since they have the
// potential to slash multiple validators at once.
} else if let Some(item) = gossip_attester_slashing_queue.pop() {
Some(item)
} else if let Some(item) = gossip_proposer_slashing_queue.pop() {
Some(item)
// Check exits and address changes late since our validators don't get
// rewards from them.
} else if let Some(item) = gossip_voluntary_exit_queue.pop() {
Some(item)
} else if let Some(item) = gossip_bls_to_execution_change_queue.pop() {
Some(item)
// Check the priority 1 API requests after we've
// processed all the interesting things from the network
// and things required for us to stay in good repute
// with our P2P peers.
} else if let Some(item) = api_request_p1_queue.pop() {
Some(item)
// Handle backfill sync chain segments.
} else if let Some(item) = backfill_chain_segment.pop() {
Some(item)
// Handle light client requests.
} else if let Some(item) = lc_gossip_finality_update_queue.pop() {
Some(item)
} else if let Some(item) = lc_gossip_optimistic_update_queue.pop() {
Some(item)
} else if let Some(item) = unknown_light_client_update_queue.pop() {
Some(item)
} else if let Some(item) = lc_bootstrap_queue.pop() {
Some(item)
} else if let Some(item) = lc_rpc_optimistic_update_queue.pop() {
Some(item)
} else if let Some(item) = lc_rpc_finality_update_queue.pop() {
Some(item)
} else if let Some(item) = lc_update_range_queue.pop() {
Some(item)
// This statement should always be the final else statement.
} else {
// Let the journal know that a worker is freed and there's nothing else
// for it to do.
if let Some(work_journal_tx) = &work_journal_tx {
// We don't care if this message was successfully sent, we only use the journal
// during testing.
let _ = work_journal_tx.try_send(NOTHING_TO_DO);
}
None
};
if let Some(work_event) = work_event {
let work_type = work_event.to_type();
@@ -1296,9 +1284,8 @@ impl<E: EthSpec> BeaconProcessor<E> {
// I cannot see any good reason why this would happen.
None => {
warn!(
self.log,
"Unexpected gossip processor condition";
"msg" => "no new work and cannot spawn worker"
msg = "no new work and cannot spawn worker",
"Unexpected gossip processor condition"
);
None
}
@@ -1313,10 +1300,9 @@ impl<E: EthSpec> BeaconProcessor<E> {
&[work_id],
);
trace!(
self.log,
"Gossip processor skipping work";
"msg" => "chain is syncing",
"work_id" => work_id
msg = "chain is syncing",
work_id = work_id,
"Gossip processor skipping work"
);
None
}
@@ -1335,89 +1321,75 @@ impl<E: EthSpec> BeaconProcessor<E> {
// Attestation batches are formed internally within the
// `BeaconProcessor`, they are not sent from external services.
Work::GossipAttestationBatch { .. } => crit!(
self.log,
"Unsupported inbound event";
"type" => "GossipAttestationBatch"
work_type = "GossipAttestationBatch",
"Unsupported inbound event"
),
Work::GossipAggregate { .. } => aggregate_queue.push(work),
// Aggregate batches are formed internally within the `BeaconProcessor`,
// they are not sent from external services.
Work::GossipAggregateBatch { .. } => crit!(
self.log,
"Unsupported inbound event";
"type" => "GossipAggregateBatch"
),
Work::GossipBlock { .. } => {
gossip_block_queue.push(work, work_id, &self.log)
}
Work::GossipBlobSidecar { .. } => {
gossip_blob_queue.push(work, work_id, &self.log)
Work::GossipAggregateBatch { .. } => {
crit!(
work_type = "GossipAggregateBatch",
"Unsupported inbound event"
)
}
Work::GossipBlock { .. } => gossip_block_queue.push(work, work_id),
Work::GossipBlobSidecar { .. } => gossip_blob_queue.push(work, work_id),
Work::GossipDataColumnSidecar { .. } => {
gossip_data_column_queue.push(work, work_id, &self.log)
gossip_data_column_queue.push(work, work_id)
}
Work::DelayedImportBlock { .. } => {
delayed_block_queue.push(work, work_id, &self.log)
delayed_block_queue.push(work, work_id)
}
Work::GossipVoluntaryExit { .. } => {
gossip_voluntary_exit_queue.push(work, work_id, &self.log)
gossip_voluntary_exit_queue.push(work, work_id)
}
Work::GossipProposerSlashing { .. } => {
gossip_proposer_slashing_queue.push(work, work_id, &self.log)
gossip_proposer_slashing_queue.push(work, work_id)
}
Work::GossipAttesterSlashing { .. } => {
gossip_attester_slashing_queue.push(work, work_id, &self.log)
gossip_attester_slashing_queue.push(work, work_id)
}
Work::GossipSyncSignature { .. } => sync_message_queue.push(work),
Work::GossipSyncContribution { .. } => {
sync_contribution_queue.push(work)
}
Work::GossipLightClientFinalityUpdate { .. } => {
lc_gossip_finality_update_queue.push(work, work_id, &self.log)
lc_gossip_finality_update_queue.push(work, work_id)
}
Work::GossipLightClientOptimisticUpdate { .. } => {
lc_gossip_optimistic_update_queue.push(work, work_id, &self.log)
lc_gossip_optimistic_update_queue.push(work, work_id)
}
Work::RpcBlock { .. } | Work::IgnoredRpcBlock { .. } => {
rpc_block_queue.push(work, work_id, &self.log)
rpc_block_queue.push(work, work_id)
}
Work::RpcBlobs { .. } => rpc_blob_queue.push(work, work_id, &self.log),
Work::RpcBlobs { .. } => rpc_blob_queue.push(work, work_id),
Work::RpcCustodyColumn { .. } => {
rpc_custody_column_queue.push(work, work_id, &self.log)
rpc_custody_column_queue.push(work, work_id)
}
Work::RpcVerifyDataColumn(_) => {
rpc_verify_data_column_queue.push(work, work_id, &self.log)
}
Work::SamplingResult(_) => {
sampling_result_queue.push(work, work_id, &self.log)
}
Work::ChainSegment { .. } => {
chain_segment_queue.push(work, work_id, &self.log)
rpc_verify_data_column_queue.push(work, work_id)
}
Work::SamplingResult(_) => sampling_result_queue.push(work, work_id),
Work::ChainSegment { .. } => chain_segment_queue.push(work, work_id),
Work::ChainSegmentBackfill { .. } => {
backfill_chain_segment.push(work, work_id, &self.log)
}
Work::Status { .. } => status_queue.push(work, work_id, &self.log),
Work::BlocksByRangeRequest { .. } => {
bbrange_queue.push(work, work_id, &self.log)
}
Work::BlocksByRootsRequest { .. } => {
bbroots_queue.push(work, work_id, &self.log)
}
Work::BlobsByRangeRequest { .. } => {
blbrange_queue.push(work, work_id, &self.log)
backfill_chain_segment.push(work, work_id)
}
Work::Status { .. } => status_queue.push(work, work_id),
Work::BlocksByRangeRequest { .. } => bbrange_queue.push(work, work_id),
Work::BlocksByRootsRequest { .. } => bbroots_queue.push(work, work_id),
Work::BlobsByRangeRequest { .. } => blbrange_queue.push(work, work_id),
Work::LightClientBootstrapRequest { .. } => {
lc_bootstrap_queue.push(work, work_id, &self.log)
lc_bootstrap_queue.push(work, work_id)
}
Work::LightClientOptimisticUpdateRequest { .. } => {
lc_rpc_optimistic_update_queue.push(work, work_id, &self.log)
lc_rpc_optimistic_update_queue.push(work, work_id)
}
Work::LightClientFinalityUpdateRequest { .. } => {
lc_rpc_finality_update_queue.push(work, work_id, &self.log)
lc_rpc_finality_update_queue.push(work, work_id)
}
Work::LightClientUpdatesByRangeRequest { .. } => {
lc_update_range_queue.push(work, work_id, &self.log)
lc_update_range_queue.push(work, work_id)
}
Work::UnknownBlockAttestation { .. } => {
unknown_block_attestation_queue.push(work)
@@ -1426,29 +1398,23 @@ impl<E: EthSpec> BeaconProcessor<E> {
unknown_block_aggregate_queue.push(work)
}
Work::GossipBlsToExecutionChange { .. } => {
gossip_bls_to_execution_change_queue.push(work, work_id, &self.log)
}
Work::BlobsByRootsRequest { .. } => {
blbroots_queue.push(work, work_id, &self.log)
gossip_bls_to_execution_change_queue.push(work, work_id)
}
Work::BlobsByRootsRequest { .. } => blbroots_queue.push(work, work_id),
Work::DataColumnsByRootsRequest { .. } => {
dcbroots_queue.push(work, work_id, &self.log)
dcbroots_queue.push(work, work_id)
}
Work::DataColumnsByRangeRequest { .. } => {
dcbrange_queue.push(work, work_id, &self.log)
dcbrange_queue.push(work, work_id)
}
Work::UnknownLightClientOptimisticUpdate { .. } => {
unknown_light_client_update_queue.push(work, work_id, &self.log)
unknown_light_client_update_queue.push(work, work_id)
}
Work::UnknownBlockSamplingRequest { .. } => {
unknown_block_sampling_request_queue.push(work, work_id, &self.log)
}
Work::ApiRequestP0 { .. } => {
api_request_p0_queue.push(work, work_id, &self.log)
}
Work::ApiRequestP1 { .. } => {
api_request_p1_queue.push(work, work_id, &self.log)
unknown_block_sampling_request_queue.push(work, work_id)
}
Work::ApiRequestP0 { .. } => api_request_p0_queue.push(work, work_id),
Work::ApiRequestP1 { .. } => api_request_p1_queue.push(work, work_id),
};
Some(work_type)
}
@@ -1526,19 +1492,17 @@ impl<E: EthSpec> BeaconProcessor<E> {
if aggregate_queue.is_full() && aggregate_debounce.elapsed() {
error!(
self.log,
"Aggregate attestation queue full";
"msg" => "the system has insufficient resources for load",
"queue_len" => aggregate_queue.max_length,
msg = "the system has insufficient resources for load",
queue_len = aggregate_queue.max_length,
"Aggregate attestation queue full"
)
}
if attestation_queue.is_full() && attestation_debounce.elapsed() {
error!(
self.log,
"Attestation queue full";
"msg" => "the system has insufficient resources for load",
"queue_len" => attestation_queue.max_length,
msg = "the system has insufficient resources for load",
queue_len = attestation_queue.max_length,
"Attestation queue full"
)
}
}
@@ -1569,7 +1533,6 @@ impl<E: EthSpec> BeaconProcessor<E> {
let send_idle_on_drop = SendOnDrop {
tx: idle_tx,
_worker_timer: worker_timer,
log: self.log.clone(),
};
let worker_id = self.current_workers;
@@ -1578,10 +1541,9 @@ impl<E: EthSpec> BeaconProcessor<E> {
let executor = self.executor.clone();
trace!(
self.log,
"Spawning beacon processor worker";
"work" => work_id,
"worker" => worker_id,
work = work_id,
worker = worker_id,
"Spawning beacon processor worker"
);
let task_spawner = TaskSpawner {
@@ -1719,8 +1681,8 @@ impl TaskSpawner {
}
}
/// This struct will send a message on `self.tx` when it is dropped. An error will be logged on
/// `self.log` if the send fails (this happens when the node is shutting down).
/// This struct will send a message on `self.tx` when it is dropped. An error will be logged
/// if the send fails (this happens when the node is shutting down).
///
/// ## Purpose
///
@@ -1733,17 +1695,15 @@ pub struct SendOnDrop {
tx: mpsc::Sender<()>,
// The field is unused, but it's here to ensure the timer is dropped once the task has finished.
_worker_timer: Option<metrics::HistogramTimer>,
log: Logger,
}
impl Drop for SendOnDrop {
fn drop(&mut self) {
if let Err(e) = self.tx.try_send(()) {
warn!(
self.log,
"Unable to free worker";
"msg" => "did not free worker, shutdown may be underway",
"error" => %e
msg = "did not free worker, shutdown may be underway",
error = %e,
"Unable to free worker"
)
}
}

View File

@@ -16,8 +16,8 @@ use fnv::FnvHashMap;
use futures::task::Poll;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use logging::crit;
use logging::TimeLatch;
use slog::{crit, debug, error, trace, warn, Logger};
use slot_clock::SlotClock;
use std::collections::{HashMap, HashSet};
use std::future::Future;
@@ -29,6 +29,7 @@ use strum::AsRefStr;
use task_executor::TaskExecutor;
use tokio::sync::mpsc::{self, Receiver, Sender};
use tokio_util::time::delay_queue::{DelayQueue, Key as DelayKey};
use tracing::{debug, error, trace, warn};
use types::{EthSpec, Hash256, Slot};
const TASK_NAME: &str = "beacon_processor_reprocess_queue";
@@ -374,7 +375,6 @@ pub fn spawn_reprocess_scheduler<S: SlotClock + 'static>(
work_reprocessing_rx: Receiver<ReprocessQueueMessage>,
executor: &TaskExecutor,
slot_clock: Arc<S>,
log: Logger,
maximum_gossip_clock_disparity: Duration,
) -> Result<(), String> {
// Sanity check
@@ -386,14 +386,10 @@ pub fn spawn_reprocess_scheduler<S: SlotClock + 'static>(
executor.spawn(
async move {
while let Some(msg) = queue.next().await {
queue.handle_message(msg, &log);
queue.handle_message(msg);
}
debug!(
log,
"Re-process queue stopped";
"msg" => "shutting down"
);
debug!(msg = "shutting down", "Re-process queue stopped");
},
TASK_NAME,
);
@@ -436,7 +432,7 @@ impl<S: SlotClock> ReprocessQueue<S> {
}
}
fn handle_message(&mut self, msg: InboundEvent, log: &Logger) {
fn handle_message(&mut self, msg: InboundEvent) {
use ReprocessQueueMessage::*;
match msg {
// Some block has been indicated as "early" and should be processed when the
@@ -455,10 +451,9 @@ impl<S: SlotClock> ReprocessQueue<S> {
if self.queued_gossip_block_roots.len() >= MAXIMUM_QUEUED_BLOCKS {
if self.early_block_debounce.elapsed() {
warn!(
log,
"Early blocks queue is full";
"queue_size" => MAXIMUM_QUEUED_BLOCKS,
"msg" => "check system clock"
queue_size = MAXIMUM_QUEUED_BLOCKS,
msg = "check system clock",
"Early blocks queue is full"
);
}
// Drop the block.
@@ -490,10 +485,7 @@ impl<S: SlotClock> ReprocessQueue<S> {
.try_send(ReadyWork::Block(early_block))
.is_err()
{
error!(
log,
"Failed to send block";
);
error!("Failed to send block");
}
}
}
@@ -507,10 +499,9 @@ impl<S: SlotClock> ReprocessQueue<S> {
if self.rpc_block_delay_queue.len() >= MAXIMUM_QUEUED_BLOCKS {
if self.rpc_block_debounce.elapsed() {
warn!(
log,
"RPC blocks queue is full";
"queue_size" => MAXIMUM_QUEUED_BLOCKS,
"msg" => "check system clock"
queue_size = MAXIMUM_QUEUED_BLOCKS,
msg = "check system clock",
"RPC blocks queue is full"
);
}
// Return the block to the beacon processor signalling to
@@ -522,10 +513,7 @@ impl<S: SlotClock> ReprocessQueue<S> {
}))
.is_err()
{
error!(
log,
"Failed to send rpc block to beacon processor";
);
error!("Failed to send rpc block to beacon processor");
}
return;
}
@@ -536,29 +524,24 @@ impl<S: SlotClock> ReprocessQueue<S> {
}
InboundEvent::ReadyRpcBlock(queued_rpc_block) => {
debug!(
log,
"Sending rpc block for reprocessing";
"block_root" => %queued_rpc_block.beacon_block_root
%queued_rpc_block.beacon_block_root,
"Sending rpc block for reprocessing"
);
if self
.ready_work_tx
.try_send(ReadyWork::RpcBlock(queued_rpc_block))
.is_err()
{
error!(
log,
"Failed to send rpc block to beacon processor";
);
error!("Failed to send rpc block to beacon processor");
}
}
InboundEvent::Msg(UnknownBlockAggregate(queued_aggregate)) => {
if self.attestations_delay_queue.len() >= MAXIMUM_QUEUED_ATTESTATIONS {
if self.attestation_delay_debounce.elapsed() {
error!(
log,
"Aggregate attestation delay queue is full";
"queue_size" => MAXIMUM_QUEUED_ATTESTATIONS,
"msg" => "check system clock"
queue_size = MAXIMUM_QUEUED_ATTESTATIONS,
msg = "check system clock",
"Aggregate attestation delay queue is full"
);
}
// Drop the attestation.
@@ -588,10 +571,9 @@ impl<S: SlotClock> ReprocessQueue<S> {
if self.attestations_delay_queue.len() >= MAXIMUM_QUEUED_ATTESTATIONS {
if self.attestation_delay_debounce.elapsed() {
error!(
log,
"Attestation delay queue is full";
"queue_size" => MAXIMUM_QUEUED_ATTESTATIONS,
"msg" => "check system clock"
queue_size = MAXIMUM_QUEUED_ATTESTATIONS,
msg = "check system clock",
"Attestation delay queue is full"
);
}
// Drop the attestation.
@@ -623,10 +605,9 @@ impl<S: SlotClock> ReprocessQueue<S> {
if self.lc_updates_delay_queue.len() >= MAXIMUM_QUEUED_LIGHT_CLIENT_UPDATES {
if self.lc_update_delay_debounce.elapsed() {
error!(
log,
"Light client updates delay queue is full";
"queue_size" => MAXIMUM_QUEUED_LIGHT_CLIENT_UPDATES,
"msg" => "check system clock"
queue_size = MAXIMUM_QUEUED_LIGHT_CLIENT_UPDATES,
msg = "check system clock",
"Light client updates delay queue is full"
);
}
// Drop the light client update.
@@ -658,9 +639,8 @@ impl<S: SlotClock> ReprocessQueue<S> {
if self.sampling_requests_delay_queue.len() >= MAXIMUM_QUEUED_SAMPLING_REQUESTS {
if self.sampling_request_delay_debounce.elapsed() {
error!(
log,
"Sampling requests delay queue is full";
"queue_size" => MAXIMUM_QUEUED_SAMPLING_REQUESTS,
queue_size = MAXIMUM_QUEUED_SAMPLING_REQUESTS,
"Sampling requests delay queue is full"
);
}
// Drop the inbound message.
@@ -724,23 +704,21 @@ impl<S: SlotClock> ReprocessQueue<S> {
// There is a mismatch between the attestation ids registered for this
// root and the queued attestations. This should never happen.
error!(
log,
"Unknown queued attestation for block root";
"block_root" => ?block_root,
"att_id" => ?id,
?block_root,
att_id = ?id,
"Unknown queued attestation for block root"
);
}
}
if failed_to_send_count > 0 {
error!(
log,
"Ignored scheduled attestation(s) for block";
"hint" => "system may be overloaded",
"parent_root" => ?parent_root,
"block_root" => ?block_root,
"failed_count" => failed_to_send_count,
"sent_count" => sent_count,
hint = "system may be overloaded",
?parent_root,
?block_root,
failed_count = failed_to_send_count,
sent_count,
"Ignored scheduled attestation(s) for block"
);
}
}
@@ -772,18 +750,17 @@ impl<S: SlotClock> ReprocessQueue<S> {
}
} else {
// This should never happen.
error!(log, "Unknown sampling request for block root"; "block_root" => ?block_root, "id" => ?id);
error!(?block_root, ?id, "Unknown sampling request for block root");
}
}
if failed_to_send_count > 0 {
error!(
log,
"Ignored scheduled sampling requests for block";
"hint" => "system may be overloaded",
"block_root" => ?block_root,
"failed_count" => failed_to_send_count,
"sent_count" => sent_count,
hint = "system may be overloaded",
?block_root,
failed_to_send_count,
sent_count,
"Ignored scheduled sampling requests for block"
);
}
}
@@ -795,10 +772,9 @@ impl<S: SlotClock> ReprocessQueue<S> {
.remove(&parent_root)
{
debug!(
log,
"Dequeuing light client optimistic updates";
"parent_root" => %parent_root,
"count" => queued_lc_id.len(),
%parent_root,
count = queued_lc_id.len(),
"Dequeuing light client optimistic updates"
);
for lc_id in queued_lc_id {
@@ -818,23 +794,16 @@ impl<S: SlotClock> ReprocessQueue<S> {
// Send the work
match self.ready_work_tx.try_send(work) {
Ok(_) => trace!(
log,
"reprocessing light client update sent";
),
Err(_) => error!(
log,
"Failed to send scheduled light client update";
),
Ok(_) => trace!("reprocessing light client update sent"),
Err(_) => error!("Failed to send scheduled light client update"),
}
} else {
// There is a mismatch between the light client update ids registered for this
// root and the queued light client updates. This should never happen.
error!(
log,
"Unknown queued light client update for parent root";
"parent_root" => ?parent_root,
"lc_id" => ?lc_id,
?parent_root,
?lc_id,
"Unknown queued light client update for parent root"
);
}
}
@@ -855,11 +824,7 @@ impl<S: SlotClock> ReprocessQueue<S> {
if !self.queued_gossip_block_roots.remove(&block_root) {
// Log an error to alert that we've made a bad assumption about how this
// program works, but still process the block anyway.
error!(
log,
"Unknown block in delay queue";
"block_root" => ?block_root
);
error!(?block_root, "Unknown block in delay queue");
}
if self
@@ -867,10 +832,7 @@ impl<S: SlotClock> ReprocessQueue<S> {
.try_send(ReadyWork::Block(ready_block))
.is_err()
{
error!(
log,
"Failed to pop queued block";
);
error!("Failed to pop queued block");
}
}
InboundEvent::ReadyAttestation(queued_id) => {
@@ -901,10 +863,9 @@ impl<S: SlotClock> ReprocessQueue<S> {
} {
if self.ready_work_tx.try_send(work).is_err() {
error!(
log,
"Ignored scheduled attestation";
"hint" => "system may be overloaded",
"beacon_block_root" => ?root
hint = "system may be overloaded",
beacon_block_root = ?root,
"Ignored scheduled attestation"
);
}
@@ -929,10 +890,7 @@ impl<S: SlotClock> ReprocessQueue<S> {
},
) {
if self.ready_work_tx.try_send(work).is_err() {
error!(
log,
"Failed to send scheduled light client optimistic update";
);
error!("Failed to send scheduled light client optimistic update");
}
if let Some(queued_lc_updates) = self
@@ -955,11 +913,7 @@ impl<S: SlotClock> ReprocessQueue<S> {
duration.as_millis().to_string()
});
debug!(
log,
"Sending scheduled backfill work";
"millis_from_slot_start" => millis_from_slot_start
);
debug!(%millis_from_slot_start, "Sending scheduled backfill work");
match self
.ready_work_tx
@@ -971,9 +925,8 @@ impl<S: SlotClock> ReprocessQueue<S> {
Err(mpsc::error::TrySendError::Full(ReadyWork::BackfillSync(batch)))
| Err(mpsc::error::TrySendError::Closed(ReadyWork::BackfillSync(batch))) => {
error!(
log,
"Failed to send scheduled backfill work";
"info" => "sending work back to queue"
info = "sending work back to queue",
"Failed to send scheduled backfill work"
);
self.queued_backfill_batches.insert(0, batch);
@@ -984,10 +937,7 @@ impl<S: SlotClock> ReprocessQueue<S> {
}
// The message was not sent and we didn't get the correct
// return result. This is a logic error.
_ => crit!(
log,
"Unexpected return from try_send error";
),
_ => crit!("Unexpected return from try_send error"),
}
}
}
@@ -1057,7 +1007,7 @@ impl<S: SlotClock> ReprocessQueue<S> {
#[cfg(test)]
mod tests {
use super::*;
use logging::test_logger;
use logging::create_test_tracing_subscriber;
use slot_clock::{ManualSlotClock, TestingSlotClock};
use std::ops::Add;
use std::sync::Arc;
@@ -1105,8 +1055,8 @@ mod tests {
// See: https://github.com/sigp/lighthouse/issues/5504#issuecomment-2050930045
#[tokio::test]
async fn backfill_schedule_failed_should_reschedule() {
create_test_tracing_subscriber();
let runtime = TestRuntime::default();
let log = test_logger();
let (work_reprocessing_tx, work_reprocessing_rx) = mpsc::channel(1);
let (ready_work_tx, mut ready_work_rx) = mpsc::channel(1);
let slot_duration = 12;
@@ -1117,7 +1067,6 @@ mod tests {
work_reprocessing_rx,
&runtime.task_executor,
slot_clock.clone(),
log,
Duration::from_millis(500),
)
.unwrap();