Integrate tracing (#6339)

Tracing Integration
- [reference](5bbf1859e9/projects/project-ideas.md#L297)


- [x] replace slog & log with tracing throughout the codebase
- [x] implement custom crit log
- [x] make relevant changes in the formatter
- [x] replace sloggers
- [x] re-write SSE logging components

cc: @macladson @eserilev
This commit is contained in:
ThreeHrSleep
2025-03-13 04:01:05 +05:30
committed by GitHub
parent f23f984f85
commit d60c24ef1c
241 changed files with 9485 additions and 9328 deletions

View File

@@ -15,8 +15,9 @@ use libp2p::swarm::handler::{
ConnectionEvent, ConnectionHandler, ConnectionHandlerEvent, DialUpgradeError,
FullyNegotiatedInbound, FullyNegotiatedOutbound, StreamUpgradeError, SubstreamProtocol,
};
use libp2p::swarm::Stream;
use slog::{crit, debug, trace};
use libp2p::swarm::{ConnectionId, Stream};
use libp2p::PeerId;
use logging::crit;
use smallvec::SmallVec;
use std::{
collections::{hash_map::Entry, VecDeque},
@@ -27,6 +28,7 @@ use std::{
};
use tokio::time::{sleep, Sleep};
use tokio_util::time::{delay_queue, DelayQueue};
use tracing::{debug, trace};
use types::{EthSpec, ForkContext};
/// The number of times to retry an outbound upgrade in the case of IO errors.
@@ -135,11 +137,11 @@ where
/// Waker, to be sure the handler gets polled when needed.
waker: Option<std::task::Waker>,
/// Logger for handling RPC streams
log: slog::Logger,
/// Timeout that will be used for inbound and outbound responses.
resp_timeout: Duration,
/// Information about this handler for logging purposes.
log_info: (PeerId, ConnectionId),
}
enum HandlerState {
@@ -221,8 +223,9 @@ where
pub fn new(
listen_protocol: SubstreamProtocol<RPCProtocol<E>, ()>,
fork_context: Arc<ForkContext>,
log: &slog::Logger,
resp_timeout: Duration,
peer_id: PeerId,
connection_id: ConnectionId,
) -> Self {
RPCHandler {
listen_protocol,
@@ -240,8 +243,8 @@ where
outbound_io_error_retries: 0,
fork_context,
waker: None,
log: log.clone(),
resp_timeout,
log_info: (peer_id, connection_id),
}
}
@@ -250,7 +253,12 @@ where
fn shutdown(&mut self, goodbye_reason: Option<(Id, GoodbyeReason)>) {
if matches!(self.state, HandlerState::Active) {
if !self.dial_queue.is_empty() {
debug!(self.log, "Starting handler shutdown"; "unsent_queued_requests" => self.dial_queue.len());
debug!(
unsent_queued_requests = self.dial_queue.len(),
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
"Starting handler shutdown"
);
}
// We now drive to completion communications already dialed/established
while let Some((id, req)) = self.dial_queue.pop() {
@@ -297,8 +305,10 @@ where
let Some(inbound_info) = self.inbound_substreams.get_mut(&inbound_id) else {
if !matches!(response, RpcResponse::StreamTermination(..)) {
// the stream is closed after sending the expected number of responses
trace!(self.log, "Inbound stream has expired. Response not sent";
"response" => %response, "id" => inbound_id);
trace!(%response, id = ?inbound_id,
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
"Inbound stream has expired. Response not sent");
}
return;
};
@@ -313,8 +323,10 @@ where
if matches!(self.state, HandlerState::Deactivated) {
// we no longer send responses after the handler is deactivated
debug!(self.log, "Response not sent. Deactivated handler";
"response" => %response, "id" => inbound_id);
debug!(%response, id = ?inbound_id,
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
"Response not sent. Deactivated handler");
return;
}
inbound_info.pending_items.push_back(response);
@@ -381,7 +393,11 @@ where
match delay.as_mut().poll(cx) {
Poll::Ready(_) => {
self.state = HandlerState::Deactivated;
debug!(self.log, "Shutdown timeout elapsed, Handler deactivated");
debug!(
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
"Shutdown timeout elapsed, Handler deactivated"
);
return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(
HandlerEvent::Close(RPCError::Disconnected),
));
@@ -428,7 +444,10 @@ where
outbound_err,
)));
} else {
crit!(self.log, "timed out substream not in the books"; "stream_id" => outbound_id.get_ref());
crit!(
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
stream_id = ?outbound_id.get_ref(), "timed out substream not in the books");
}
}
@@ -557,10 +576,24 @@ where
// BlocksByRange is the one that typically consumes the most time.
// It's useful to log when the request was completed.
if matches!(info.protocol, Protocol::BlocksByRange) {
debug!(self.log, "BlocksByRange Response sent"; "duration" => Instant::now().duration_since(info.request_start_time).as_secs());
debug!(
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
duration = Instant::now()
.duration_since(info.request_start_time)
.as_secs(),
"BlocksByRange Response sent"
);
}
if matches!(info.protocol, Protocol::BlobsByRange) {
debug!(self.log, "BlobsByRange Response sent"; "duration" => Instant::now().duration_since(info.request_start_time).as_secs());
debug!(
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
duration = Instant::now()
.duration_since(info.request_start_time)
.as_secs(),
"BlobsByRange Response sent"
);
}
// There is nothing more to process on this substream as it has
@@ -583,10 +616,20 @@ where
}));
if matches!(info.protocol, Protocol::BlocksByRange) {
debug!(self.log, "BlocksByRange Response failed"; "duration" => info.request_start_time.elapsed().as_secs());
debug!(
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
duration = info.request_start_time.elapsed().as_secs(),
"BlocksByRange Response failed"
);
}
if matches!(info.protocol, Protocol::BlobsByRange) {
debug!(self.log, "BlobsByRange Response failed"; "duration" => info.request_start_time.elapsed().as_secs());
debug!(
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
duration = info.request_start_time.elapsed().as_secs(),
"BlobsByRange Response failed"
);
}
break;
}
@@ -695,7 +738,7 @@ where
// stream closed
// if we expected multiple streams send a stream termination,
// else report the stream terminating only.
//trace!(self.log, "RPC Response - stream closed by remote");
// "RPC Response - stream closed by remote"
// drop the stream
let delay_key = &entry.get().delay_key;
let request_id = entry.get().req_id;
@@ -772,7 +815,11 @@ where
}
}
OutboundSubstreamState::Poisoned => {
crit!(self.log, "Poisoned outbound substream");
crit!(
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
"Poisoned outbound substream"
);
unreachable!("Coding Error: Outbound substream is poisoned")
}
}
@@ -804,7 +851,11 @@ where
&& self.events_out.is_empty()
&& self.dial_negotiated == 0
{
debug!(self.log, "Goodbye sent, Handler deactivated");
debug!(
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
"Goodbye sent, Handler deactivated"
);
self.state = HandlerState::Deactivated;
return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(
HandlerEvent::Close(RPCError::Disconnected),
@@ -997,7 +1048,11 @@ where
)
.is_some()
{
crit!(self.log, "Duplicate outbound substream id"; "id" => self.current_outbound_substream_id);
crit!(
peer_id = %self.log_info.0,
connection_id = %self.log_info.1,
id = ?self.current_outbound_substream_id, "Duplicate outbound substream id");
}
self.current_outbound_substream_id.0 += 1;
}
@@ -1045,17 +1100,6 @@ where
}
}
impl slog::Value for SubstreamId {
fn serialize(
&self,
record: &slog::Record,
key: slog::Key,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
slog::Value::serialize(&self.0, record, key, serializer)
}
}
/// Creates a future that can be polled that will send any queued message to the peer.
///
/// This function returns the given substream, along with whether it has been closed or not. Any

View File

@@ -864,19 +864,3 @@ impl std::fmt::Display for DataColumnsByRootRequest {
)
}
}
impl slog::KV for StatusMessage {
fn serialize(
&self,
record: &slog::Record,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
use slog::Value;
serializer.emit_arguments("fork_digest", &format_args!("{:?}", self.fork_digest))?;
Value::serialize(&self.finalized_epoch, record, "finalized_epoch", serializer)?;
serializer.emit_arguments("finalized_root", &format_args!("{}", self.finalized_root))?;
Value::serialize(&self.head_slot, record, "head_slot", serializer)?;
serializer.emit_arguments("head_root", &format_args!("{}", self.head_root))?;
slog::Result::Ok(())
}
}

View File

@@ -13,13 +13,14 @@ use libp2p::swarm::{
};
use libp2p::swarm::{ConnectionClosed, FromSwarm, SubstreamProtocol, THandlerInEvent};
use libp2p::PeerId;
use logging::crit;
use rate_limiter::{RPCRateLimiter as RateLimiter, RateLimitedErr};
use slog::{crit, debug, o, trace};
use std::marker::PhantomData;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::task::{Context, Poll};
use std::time::Duration;
use tracing::{debug, instrument, trace};
use types::{EthSpec, ForkContext};
pub(crate) use handler::{HandlerErr, HandlerEvent};
@@ -159,8 +160,6 @@ pub struct RPC<Id: ReqId, E: EthSpec> {
events: Vec<BehaviourAction<Id, E>>,
fork_context: Arc<ForkContext>,
enable_light_client_server: bool,
/// Slog logger for RPC behaviour.
log: slog::Logger,
/// Networking constant values
network_params: NetworkParams,
/// A sequential counter indicating when data gets modified.
@@ -168,25 +167,28 @@ pub struct RPC<Id: ReqId, E: EthSpec> {
}
impl<Id: ReqId, E: EthSpec> RPC<Id, E> {
#[instrument(parent = None,
level = "trace",
fields(service = "libp2p_rpc"),
name = "libp2p_rpc",
skip_all
)]
pub fn new(
fork_context: Arc<ForkContext>,
enable_light_client_server: bool,
inbound_rate_limiter_config: Option<InboundRateLimiterConfig>,
outbound_rate_limiter_config: Option<OutboundRateLimiterConfig>,
log: slog::Logger,
network_params: NetworkParams,
seq_number: u64,
) -> Self {
let log = log.new(o!("service" => "libp2p_rpc"));
let inbound_limiter = inbound_rate_limiter_config.map(|config| {
debug!(log, "Using inbound rate limiting params"; "config" => ?config);
debug!(?config, "Using inbound rate limiting params");
RateLimiter::new_with_config(config.0, fork_context.clone())
.expect("Inbound limiter configuration parameters are valid")
});
let self_limiter = outbound_rate_limiter_config.map(|config| {
SelfRateLimiter::new(config, fork_context.clone(), log.clone())
SelfRateLimiter::new(config, fork_context.clone())
.expect("Configuration parameters are valid")
});
@@ -196,7 +198,6 @@ impl<Id: ReqId, E: EthSpec> RPC<Id, E> {
events: Vec::new(),
fork_context,
enable_light_client_server,
log,
network_params,
seq_number,
}
@@ -205,6 +206,12 @@ impl<Id: ReqId, E: EthSpec> RPC<Id, E> {
/// Sends an RPC response.
///
/// The peer must be connected for this to succeed.
#[instrument(parent = None,
level = "trace",
fields(service = "libp2p_rpc"),
name = "libp2p_rpc",
skip_all
)]
pub fn send_response(
&mut self,
peer_id: PeerId,
@@ -222,6 +229,12 @@ impl<Id: ReqId, E: EthSpec> RPC<Id, E> {
/// Submits an RPC request.
///
/// The peer must be connected for this to succeed.
#[instrument(parent = None,
level = "trace",
fields(service = "libp2p_rpc"),
name = "libp2p_rpc",
skip_all
)]
pub fn send_request(&mut self, peer_id: PeerId, request_id: Id, req: RequestType<E>) {
let event = if let Some(self_limiter) = self.self_limiter.as_mut() {
match self_limiter.allows(peer_id, request_id, req) {
@@ -244,6 +257,12 @@ impl<Id: ReqId, E: EthSpec> RPC<Id, E> {
/// Lighthouse wishes to disconnect from this peer by sending a Goodbye message. This
/// gracefully terminates the RPC behaviour with a goodbye message.
#[instrument(parent = None,
level = "trace",
fields(service = "libp2p_rpc"),
name = "libp2p_rpc",
skip_all
)]
pub fn shutdown(&mut self, peer_id: PeerId, id: Id, reason: GoodbyeReason) {
self.events.push(ToSwarm::NotifyHandler {
peer_id,
@@ -252,16 +271,28 @@ impl<Id: ReqId, E: EthSpec> RPC<Id, E> {
});
}
#[instrument(parent = None,
level = "trace",
fields(service = "libp2p_rpc"),
name = "libp2p_rpc",
skip_all
)]
pub fn update_seq_number(&mut self, seq_number: u64) {
self.seq_number = seq_number
}
/// Send a Ping request to the destination `PeerId` via `ConnectionId`.
#[instrument(parent = None,
level = "trace",
fields(service = "libp2p_rpc"),
name = "libp2p_rpc",
skip_all
)]
pub fn ping(&mut self, peer_id: PeerId, id: Id) {
let ping = Ping {
data: self.seq_number,
};
trace!(self.log, "Sending Ping"; "peer_id" => %peer_id);
trace!(%peer_id, "Sending Ping");
self.send_request(peer_id, id, RequestType::Ping(ping));
}
}
@@ -291,14 +322,13 @@ where
},
(),
);
let log = self
.log
.new(slog::o!("peer_id" => peer_id.to_string(), "connection_id" => connection_id.to_string()));
let handler = RPCHandler::new(
protocol,
self.fork_context.clone(),
&log,
self.network_params.resp_timeout,
peer_id,
connection_id,
);
Ok(handler)
@@ -323,15 +353,12 @@ where
(),
);
let log = self
.log
.new(slog::o!("peer_id" => peer_id.to_string(), "connection_id" => connection_id.to_string()));
let handler = RPCHandler::new(
protocol,
self.fork_context.clone(),
&log,
self.network_params.resp_timeout,
peer_id,
connection_id,
);
Ok(handler)
@@ -421,10 +448,10 @@ where
| Protocol::BlobsByRoot
| Protocol::DataColumnsByRoot
) {
debug!(self.log, "Request too large to process"; "request" => %r#type, "protocol" => %protocol);
debug!(request = %r#type, %protocol, "Request too large to process");
} else {
// Other protocols shouldn't be sending large messages, we should flag the peer kind
crit!(self.log, "Request size too large to ever be processed"; "protocol" => %protocol);
crit!(%protocol, "Request size too large to ever be processed");
}
// send an error code to the peer.
// the handler upon receiving the error code will send it back to the behaviour
@@ -440,8 +467,7 @@ where
return;
}
Err(RateLimitedErr::TooSoon(wait_time)) => {
debug!(self.log, "Request exceeds the rate limit";
"request" => %r#type, "peer_id" => %peer_id, "wait_time_ms" => wait_time.as_millis());
debug!(request = %r#type, %peer_id, wait_time_ms = wait_time.as_millis(), "Request exceeds the rate limit");
// send an error code to the peer.
// the handler upon receiving the error code will send it back to the behaviour
self.send_response(
@@ -462,7 +488,7 @@ where
// If we received a Ping, we queue a Pong response.
if let RequestType::Ping(_) = r#type {
trace!(self.log, "Received Ping, queueing Pong";"connection_id" => %conn_id, "peer_id" => %peer_id);
trace!(connection_id = %conn_id, %peer_id, "Received Ping, queueing Pong");
self.send_response(
peer_id,
(conn_id, substream_id),
@@ -526,53 +552,3 @@ where
Poll::Pending
}
}
impl<Id, E> slog::KV for RPCMessage<Id, E>
where
E: EthSpec,
Id: ReqId,
{
fn serialize(
&self,
_record: &slog::Record,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
serializer.emit_arguments("peer_id", &format_args!("{}", self.peer_id))?;
match &self.message {
Ok(received) => {
let (msg_kind, protocol) = match received {
RPCReceived::Request(Request { r#type, .. }) => {
("request", r#type.versioned_protocol().protocol())
}
RPCReceived::Response(_, res) => ("response", res.protocol()),
RPCReceived::EndOfStream(_, end) => (
"end_of_stream",
match end {
ResponseTermination::BlocksByRange => Protocol::BlocksByRange,
ResponseTermination::BlocksByRoot => Protocol::BlocksByRoot,
ResponseTermination::BlobsByRange => Protocol::BlobsByRange,
ResponseTermination::BlobsByRoot => Protocol::BlobsByRoot,
ResponseTermination::DataColumnsByRoot => Protocol::DataColumnsByRoot,
ResponseTermination::DataColumnsByRange => Protocol::DataColumnsByRange,
ResponseTermination::LightClientUpdatesByRange => {
Protocol::LightClientUpdatesByRange
}
},
),
};
serializer.emit_str("msg_kind", msg_kind)?;
serializer.emit_arguments("protocol", &format_args!("{}", protocol))?;
}
Err(error) => {
let (msg_kind, protocol) = match &error {
HandlerErr::Inbound { proto, .. } => ("inbound_err", *proto),
HandlerErr::Outbound { proto, .. } => ("outbound_err", *proto),
};
serializer.emit_str("msg_kind", msg_kind)?;
serializer.emit_arguments("protocol", &format_args!("{}", protocol))?;
}
};
slog::Result::Ok(())
}
}

View File

@@ -7,9 +7,10 @@ use std::{
use futures::FutureExt;
use libp2p::{swarm::NotifyHandler, PeerId};
use slog::{crit, debug, Logger};
use logging::crit;
use smallvec::SmallVec;
use tokio_util::time::DelayQueue;
use tracing::debug;
use types::{EthSpec, ForkContext};
use super::{
@@ -36,8 +37,6 @@ pub(crate) struct SelfRateLimiter<Id: ReqId, E: EthSpec> {
limiter: RateLimiter,
/// Requests that are ready to be sent.
ready_requests: SmallVec<[(PeerId, RPCSend<Id, E>); 3]>,
/// Slog logger.
log: Logger,
}
/// Error returned when the rate limiter does not accept a request.
@@ -54,9 +53,8 @@ impl<Id: ReqId, E: EthSpec> SelfRateLimiter<Id, E> {
pub fn new(
config: OutboundRateLimiterConfig,
fork_context: Arc<ForkContext>,
log: Logger,
) -> Result<Self, &'static str> {
debug!(log, "Using self rate limiting params"; "config" => ?config);
debug!(?config, "Using self rate limiting params");
let limiter = RateLimiter::new_with_config(config.0, fork_context)?;
Ok(SelfRateLimiter {
@@ -64,7 +62,6 @@ impl<Id: ReqId, E: EthSpec> SelfRateLimiter<Id, E> {
next_peer_request: Default::default(),
limiter,
ready_requests: Default::default(),
log,
})
}
@@ -84,7 +81,7 @@ impl<Id: ReqId, E: EthSpec> SelfRateLimiter<Id, E> {
return Err(Error::PendingRequests);
}
match Self::try_send_request(&mut self.limiter, peer_id, request_id, req, &self.log) {
match Self::try_send_request(&mut self.limiter, peer_id, request_id, req) {
Err((rate_limited_req, wait_time)) => {
let key = (peer_id, protocol);
self.next_peer_request.insert(key, wait_time);
@@ -107,7 +104,6 @@ impl<Id: ReqId, E: EthSpec> SelfRateLimiter<Id, E> {
peer_id: PeerId,
request_id: Id,
req: RequestType<E>,
log: &Logger,
) -> Result<RPCSend<Id, E>, (QueuedRequest<Id, E>, Duration)> {
match limiter.allows(&peer_id, &req) {
Ok(()) => Ok(RPCSend::Request(request_id, req)),
@@ -118,14 +114,13 @@ impl<Id: ReqId, E: EthSpec> SelfRateLimiter<Id, E> {
// this should never happen with default parameters. Let's just send the request.
// Log a crit since this is a config issue.
crit!(
log,
"Self rate limiting error for a batch that will never fit. Sending request anyway. Check configuration parameters.";
"protocol" => %req.versioned_protocol().protocol()
protocol = %req.versioned_protocol().protocol(),
"Self rate limiting error for a batch that will never fit. Sending request anyway. Check configuration parameters."
);
Ok(RPCSend::Request(request_id, req))
}
RateLimitedErr::TooSoon(wait_time) => {
debug!(log, "Self rate limiting"; "protocol" => %protocol.protocol(), "wait_time_ms" => wait_time.as_millis(), "peer_id" => %peer_id);
debug!(protocol = %protocol.protocol(), wait_time_ms = wait_time.as_millis(), %peer_id, "Self rate limiting");
Err((QueuedRequest { req, request_id }, wait_time))
}
}
@@ -139,8 +134,7 @@ impl<Id: ReqId, E: EthSpec> SelfRateLimiter<Id, E> {
if let Entry::Occupied(mut entry) = self.delayed_requests.entry((peer_id, protocol)) {
let queued_requests = entry.get_mut();
while let Some(QueuedRequest { req, request_id }) = queued_requests.pop_front() {
match Self::try_send_request(&mut self.limiter, peer_id, request_id, req, &self.log)
{
match Self::try_send_request(&mut self.limiter, peer_id, request_id, req) {
Err((rate_limited_req, wait_time)) => {
let key = (peer_id, protocol);
self.next_peer_request.insert(key, wait_time);
@@ -215,13 +209,14 @@ mod tests {
use crate::rpc::{Ping, Protocol, RequestType};
use crate::service::api_types::{AppRequestId, RequestId, SingleLookupReqId, SyncRequestId};
use libp2p::PeerId;
use logging::create_test_tracing_subscriber;
use std::time::Duration;
use types::{EthSpec, ForkContext, Hash256, MainnetEthSpec, Slot};
/// Test that `next_peer_request_ready` correctly maintains the queue.
#[tokio::test]
async fn test_next_peer_request_ready() {
let log = logging::test_logger();
create_test_tracing_subscriber();
let config = OutboundRateLimiterConfig(RateLimiterConfig {
ping_quota: Quota::n_every(1, 2),
..Default::default()
@@ -232,7 +227,7 @@ mod tests {
&MainnetEthSpec::default_spec(),
));
let mut limiter: SelfRateLimiter<RequestId, MainnetEthSpec> =
SelfRateLimiter::new(config, fork_context, log).unwrap();
SelfRateLimiter::new(config, fork_context).unwrap();
let peer_id = PeerId::random();
let lookup_id = 0;