Implement tracing spans for data column RPC requests and responses (#7831)

#7830
This commit is contained in:
Jimmy Chen
2025-08-21 09:35:51 +10:00
committed by GitHub
parent 2d223575d6
commit f19d4f6af1
18 changed files with 491 additions and 66 deletions

View File

@@ -6,13 +6,14 @@ use beacon_chain::validator_monitor::timestamp_now;
use fnv::FnvHashMap;
use lighthouse_network::PeerId;
use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester};
use lighthouse_tracing::SPAN_OUTGOING_CUSTODY_REQUEST;
use lru_cache::LRUTimeCache;
use parking_lot::RwLock;
use rand::Rng;
use std::collections::HashSet;
use std::time::{Duration, Instant};
use std::{collections::HashMap, marker::PhantomData, sync::Arc};
use tracing::{debug, warn};
use tracing::{Span, debug, debug_span, field, warn};
use types::{DataColumnSidecar, Hash256, data_column_sidecar::ColumnIndex};
use types::{DataColumnSidecarList, EthSpec};
@@ -34,7 +35,8 @@ pub struct ActiveCustodyRequest<T: BeaconChainTypes> {
failed_peers: LRUTimeCache<PeerId>,
/// Set of peers that claim to have imported this block and their custody columns
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
/// Span for tracing the lifetime of this request.
span: Span,
_phantom: PhantomData<T>,
}
@@ -55,6 +57,8 @@ pub enum Error {
struct ActiveBatchColumnsRequest {
indices: Vec<ColumnIndex>,
/// Span for tracing the lifetime of this request.
span: Span,
}
pub type CustodyRequestResult<E> =
@@ -67,6 +71,7 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
column_indices: &[ColumnIndex],
lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
) -> Self {
let span = debug_span!(parent: None, SPAN_OUTGOING_CUSTODY_REQUEST, %block_root);
Self {
block_root,
custody_id,
@@ -78,6 +83,7 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
active_batch_columns_requests: <_>::default(),
failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)),
lookup_peers,
span,
_phantom: PhantomData,
}
}
@@ -106,6 +112,8 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
return Ok(None);
};
let _guard = batch_request.span.clone().entered();
match resp {
Ok((data_columns, seen_timestamp)) => {
debug!(
@@ -163,6 +171,11 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
"Custody column peer claims to not have some data"
);
batch_request.span.record(
"missing_column_indexes",
field::debug(missing_column_indexes),
);
self.failed_peers.insert(peer_id);
}
}
@@ -183,6 +196,11 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
.on_download_error_and_mark_failure(req_id)?;
}
batch_request.span.record(
"missing_column_indexes",
field::debug(&batch_request.indices),
);
self.failed_peers.insert(peer_id);
}
};
@@ -194,6 +212,7 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
&mut self,
cx: &mut SyncNetworkContext<T>,
) -> CustodyRequestResult<T::EthSpec> {
let _guard = self.span.clone().entered();
if self.column_requests.values().all(|r| r.is_downloaded()) {
// All requests have completed successfully.
let mut peers = HashMap::<PeerId, Vec<usize>>::new();
@@ -298,6 +317,9 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
match request_result {
LookupRequestResult::RequestSent(req_id) => {
let client = cx.network_globals().client(&peer_id).kind;
let batch_columns_req_span = debug_span!("batch_columns_req", %peer_id, %client, missing_column_indexes = tracing::field::Empty);
let _guard = batch_columns_req_span.clone().entered();
for column_index in &indices {
let column_request = self
.column_requests
@@ -308,8 +330,13 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
column_request.on_download_start(req_id)?;
}
self.active_batch_columns_requests
.insert(req_id, ActiveBatchColumnsRequest { indices });
self.active_batch_columns_requests.insert(
req_id,
ActiveBatchColumnsRequest {
indices,
span: batch_columns_req_span,
},
);
}
LookupRequestResult::NoRequestNeeded(_) => unreachable!(),
LookupRequestResult::Pending(_) => unreachable!(),

View File

@@ -4,6 +4,7 @@ use beacon_chain::validator_monitor::timestamp_now;
use fnv::FnvHashMap;
use lighthouse_network::PeerId;
use strum::IntoStaticStr;
use tracing::Span;
use types::{Hash256, Slot};
pub use blobs_by_range::BlobsByRangeRequestItems;
@@ -50,6 +51,7 @@ struct ActiveRequest<T: ActiveRequestItems> {
peer_id: PeerId,
// Error if the request terminates before receiving max expected responses
expect_max_responses: bool,
span: Span,
}
enum State<T> {
@@ -66,13 +68,22 @@ impl<K: Eq + Hash, T: ActiveRequestItems> ActiveRequests<K, T> {
}
}
pub fn insert(&mut self, id: K, peer_id: PeerId, expect_max_responses: bool, items: T) {
pub fn insert(
&mut self,
id: K,
peer_id: PeerId,
expect_max_responses: bool,
items: T,
span: Span,
) {
let _guard = span.clone().entered();
self.requests.insert(
id,
ActiveRequest {
state: State::Active(items),
peer_id,
expect_max_responses,
span,
},
);
}
@@ -106,6 +117,7 @@ impl<K: Eq + Hash, T: ActiveRequestItems> ActiveRequests<K, T> {
// `ActiveRequestItems` validates the item before appending to its internal state.
RpcEvent::Response(item, seen_timestamp) => {
let request = &mut entry.get_mut();
let _guard = request.span.clone().entered();
match &mut request.state {
State::Active(items) => {
match items.add(item) {
@@ -141,6 +153,7 @@ impl<K: Eq + Hash, T: ActiveRequestItems> ActiveRequests<K, T> {
// After stream termination we must forget about this request, there will be no more
// messages coming from the network
let request = entry.remove();
let _guard = request.span.clone().entered();
match request.state {
// Received a stream termination in a valid sequence, consume items
State::Active(mut items) => {
@@ -162,7 +175,9 @@ impl<K: Eq + Hash, T: ActiveRequestItems> ActiveRequests<K, T> {
RpcEvent::RPCError(e) => {
// After an Error event from the network we must forget about this request as this
// may be the last message for this request.
match entry.remove().state {
let request = entry.remove();
let _guard = request.span.clone().entered();
match request.state {
// Received error while request is still active, propagate error.
State::Active(_) => Some(Err(e.into())),
// Received error after completing the request, ignore the error. This is okay