mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-17 11:52:42 +00:00
Implement tracing spans for data columm RPC requests and responses (#7831)
#7830
This commit is contained in:
@@ -6,13 +6,14 @@ use beacon_chain::validator_monitor::timestamp_now;
|
||||
use fnv::FnvHashMap;
|
||||
use lighthouse_network::PeerId;
|
||||
use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester};
|
||||
use lighthouse_tracing::SPAN_OUTGOING_CUSTODY_REQUEST;
|
||||
use lru_cache::LRUTimeCache;
|
||||
use parking_lot::RwLock;
|
||||
use rand::Rng;
|
||||
use std::collections::HashSet;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{collections::HashMap, marker::PhantomData, sync::Arc};
|
||||
use tracing::{debug, warn};
|
||||
use tracing::{Span, debug, debug_span, field, warn};
|
||||
use types::{DataColumnSidecar, Hash256, data_column_sidecar::ColumnIndex};
|
||||
use types::{DataColumnSidecarList, EthSpec};
|
||||
|
||||
@@ -34,7 +35,8 @@ pub struct ActiveCustodyRequest<T: BeaconChainTypes> {
|
||||
failed_peers: LRUTimeCache<PeerId>,
|
||||
/// Set of peers that claim to have imported this block and their custody columns
|
||||
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
|
||||
|
||||
/// Span for tracing the lifetime of this request.
|
||||
span: Span,
|
||||
_phantom: PhantomData<T>,
|
||||
}
|
||||
|
||||
@@ -55,6 +57,8 @@ pub enum Error {
|
||||
|
||||
struct ActiveBatchColumnsRequest {
|
||||
indices: Vec<ColumnIndex>,
|
||||
/// Span for tracing the lifetime of this request.
|
||||
span: Span,
|
||||
}
|
||||
|
||||
pub type CustodyRequestResult<E> =
|
||||
@@ -67,6 +71,7 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
column_indices: &[ColumnIndex],
|
||||
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
|
||||
) -> Self {
|
||||
let span = debug_span!(parent: None, SPAN_OUTGOING_CUSTODY_REQUEST, %block_root);
|
||||
Self {
|
||||
block_root,
|
||||
custody_id,
|
||||
@@ -78,6 +83,7 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
active_batch_columns_requests: <_>::default(),
|
||||
failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)),
|
||||
lookup_peers,
|
||||
span,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
@@ -106,6 +112,8 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let _guard = batch_request.span.clone().entered();
|
||||
|
||||
match resp {
|
||||
Ok((data_columns, seen_timestamp)) => {
|
||||
debug!(
|
||||
@@ -163,6 +171,11 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
"Custody column peer claims to not have some data"
|
||||
);
|
||||
|
||||
batch_request.span.record(
|
||||
"missing_column_indexes",
|
||||
field::debug(missing_column_indexes),
|
||||
);
|
||||
|
||||
self.failed_peers.insert(peer_id);
|
||||
}
|
||||
}
|
||||
@@ -183,6 +196,11 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
.on_download_error_and_mark_failure(req_id)?;
|
||||
}
|
||||
|
||||
batch_request.span.record(
|
||||
"missing_column_indexes",
|
||||
field::debug(&batch_request.indices),
|
||||
);
|
||||
|
||||
self.failed_peers.insert(peer_id);
|
||||
}
|
||||
};
|
||||
@@ -194,6 +212,7 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
&mut self,
|
||||
cx: &mut SyncNetworkContext<T>,
|
||||
) -> CustodyRequestResult<T::EthSpec> {
|
||||
let _guard = self.span.clone().entered();
|
||||
if self.column_requests.values().all(|r| r.is_downloaded()) {
|
||||
// All requests have completed successfully.
|
||||
let mut peers = HashMap::<PeerId, Vec<usize>>::new();
|
||||
@@ -298,6 +317,9 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
|
||||
match request_result {
|
||||
LookupRequestResult::RequestSent(req_id) => {
|
||||
let client = cx.network_globals().client(&peer_id).kind;
|
||||
let batch_columns_req_span = debug_span!("batch_columns_req", %peer_id, %client, missing_column_indexes = tracing::field::Empty);
|
||||
let _guard = batch_columns_req_span.clone().entered();
|
||||
for column_index in &indices {
|
||||
let column_request = self
|
||||
.column_requests
|
||||
@@ -308,8 +330,13 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
|
||||
column_request.on_download_start(req_id)?;
|
||||
}
|
||||
|
||||
self.active_batch_columns_requests
|
||||
.insert(req_id, ActiveBatchColumnsRequest { indices });
|
||||
self.active_batch_columns_requests.insert(
|
||||
req_id,
|
||||
ActiveBatchColumnsRequest {
|
||||
indices,
|
||||
span: batch_columns_req_span,
|
||||
},
|
||||
);
|
||||
}
|
||||
LookupRequestResult::NoRequestNeeded(_) => unreachable!(),
|
||||
LookupRequestResult::Pending(_) => unreachable!(),
|
||||
|
||||
@@ -4,6 +4,7 @@ use beacon_chain::validator_monitor::timestamp_now;
|
||||
use fnv::FnvHashMap;
|
||||
use lighthouse_network::PeerId;
|
||||
use strum::IntoStaticStr;
|
||||
use tracing::Span;
|
||||
use types::{Hash256, Slot};
|
||||
|
||||
pub use blobs_by_range::BlobsByRangeRequestItems;
|
||||
@@ -50,6 +51,7 @@ struct ActiveRequest<T: ActiveRequestItems> {
|
||||
peer_id: PeerId,
|
||||
// Error if the request terminates before receiving max expected responses
|
||||
expect_max_responses: bool,
|
||||
span: Span,
|
||||
}
|
||||
|
||||
enum State<T> {
|
||||
@@ -66,13 +68,22 @@ impl<K: Eq + Hash, T: ActiveRequestItems> ActiveRequests<K, T> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, id: K, peer_id: PeerId, expect_max_responses: bool, items: T) {
|
||||
pub fn insert(
|
||||
&mut self,
|
||||
id: K,
|
||||
peer_id: PeerId,
|
||||
expect_max_responses: bool,
|
||||
items: T,
|
||||
span: Span,
|
||||
) {
|
||||
let _guard = span.clone().entered();
|
||||
self.requests.insert(
|
||||
id,
|
||||
ActiveRequest {
|
||||
state: State::Active(items),
|
||||
peer_id,
|
||||
expect_max_responses,
|
||||
span,
|
||||
},
|
||||
);
|
||||
}
|
||||
@@ -106,6 +117,7 @@ impl<K: Eq + Hash, T: ActiveRequestItems> ActiveRequests<K, T> {
|
||||
// `ActiveRequestItems` validates the item before appending to its internal state.
|
||||
RpcEvent::Response(item, seen_timestamp) => {
|
||||
let request = &mut entry.get_mut();
|
||||
let _guard = request.span.clone().entered();
|
||||
match &mut request.state {
|
||||
State::Active(items) => {
|
||||
match items.add(item) {
|
||||
@@ -141,6 +153,7 @@ impl<K: Eq + Hash, T: ActiveRequestItems> ActiveRequests<K, T> {
|
||||
// After stream termination we must forget about this request, there will be no more
|
||||
// messages coming from the network
|
||||
let request = entry.remove();
|
||||
let _guard = request.span.clone().entered();
|
||||
match request.state {
|
||||
// Received a stream termination in a valid sequence, consume items
|
||||
State::Active(mut items) => {
|
||||
@@ -162,7 +175,9 @@ impl<K: Eq + Hash, T: ActiveRequestItems> ActiveRequests<K, T> {
|
||||
RpcEvent::RPCError(e) => {
|
||||
// After an Error event from the network we must forget about this request as this
|
||||
// may be the last message for this request.
|
||||
match entry.remove().state {
|
||||
let request = entry.remove();
|
||||
let _guard = request.span.clone().entered();
|
||||
match request.state {
|
||||
// Received error while request is still active, propagate error.
|
||||
State::Active(_) => Some(Err(e.into())),
|
||||
// Received error after completing the request, ignore the error. This is okay
|
||||
|
||||
Reference in New Issue
Block a user