mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-02 16:21:42 +00:00
Integrate tracing (#6339)
Tracing Integration
- [reference](5bbf1859e9/projects/project-ideas.md (L297))
- [x] replace slog & log with tracing throughout the codebase
- [x] implement custom crit log
- [x] make relevant changes in the formatter
- [x] replace sloggers
- [x] re-write SSE logging components
cc: @macladson @eserilev
This commit is contained in:
@@ -4,17 +4,9 @@ version = "0.1.0"
|
||||
authors = ["Sigma Prime <contact@sigmaprime.io>"]
|
||||
edition = { workspace = true }
|
||||
|
||||
[features]
|
||||
default = ["slog"]
|
||||
slog = ["dep:slog", "dep:sloggers", "dep:logging"]
|
||||
tracing = ["dep:tracing"]
|
||||
|
||||
[dependencies]
|
||||
async-channel = { workspace = true }
|
||||
futures = { workspace = true }
|
||||
logging = { workspace = true, optional = true }
|
||||
metrics = { workspace = true }
|
||||
slog = { workspace = true, optional = true }
|
||||
sloggers = { workspace = true, optional = true }
|
||||
tokio = { workspace = true, features = ["rt-multi-thread", "macros"] }
|
||||
tracing = { workspace = true, optional = true }
|
||||
tracing = { workspace = true }
|
||||
|
||||
@@ -1,20 +1,14 @@
|
||||
mod metrics;
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
pub mod test_utils;
|
||||
|
||||
use futures::channel::mpsc::Sender;
|
||||
use futures::prelude::*;
|
||||
use std::sync::Weak;
|
||||
use tokio::runtime::{Handle, Runtime};
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
pub use tokio::task::JoinHandle;
|
||||
|
||||
// Set up logging framework
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
use slog::{debug, o};
|
||||
#[cfg(feature = "tracing")]
|
||||
use tracing::debug;
|
||||
|
||||
/// Provides a reason when Lighthouse is shut down.
|
||||
#[derive(Copy, Clone, Debug, PartialEq)]
|
||||
pub enum ShutdownReason {
|
||||
@@ -85,8 +79,9 @@ pub struct TaskExecutor {
|
||||
///
|
||||
/// The task must provide a reason for shutting down.
|
||||
signal_tx: Sender<ShutdownReason>,
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
log: slog::Logger,
|
||||
|
||||
/// The name of the service for inclusion in the logger output.
|
||||
service_name: String,
|
||||
}
|
||||
|
||||
impl TaskExecutor {
|
||||
@@ -97,39 +92,29 @@ impl TaskExecutor {
|
||||
/// This function should only be used during testing. In production, prefer to obtain an
|
||||
/// instance of `Self` via a `environment::RuntimeContext` (see the `lighthouse/environment`
|
||||
/// crate).
|
||||
#[instrument(parent = None,level = "info", fields(service = service_name), name = "task_executor", skip_all)]
|
||||
pub fn new<T: Into<HandleProvider>>(
|
||||
handle: T,
|
||||
exit: async_channel::Receiver<()>,
|
||||
#[cfg(not(feature = "tracing"))] log: slog::Logger,
|
||||
signal_tx: Sender<ShutdownReason>,
|
||||
service_name: String,
|
||||
) -> Self {
|
||||
Self {
|
||||
handle_provider: handle.into(),
|
||||
exit,
|
||||
signal_tx,
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
log,
|
||||
service_name,
|
||||
}
|
||||
}
|
||||
|
||||
/// Clones the task executor adding a service name.
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
#[instrument(parent = None,level = "info", fields(service = service_name), name = "task_executor", skip_all)]
|
||||
pub fn clone_with_name(&self, service_name: String) -> Self {
|
||||
TaskExecutor {
|
||||
handle_provider: self.handle_provider.clone(),
|
||||
exit: self.exit.clone(),
|
||||
signal_tx: self.signal_tx.clone(),
|
||||
log: self.log.new(o!("service" => service_name)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Clones the task executor adding a service name.
|
||||
#[cfg(feature = "tracing")]
|
||||
pub fn clone(&self) -> Self {
|
||||
TaskExecutor {
|
||||
handle_provider: self.handle_provider.clone(),
|
||||
exit: self.exit.clone(),
|
||||
signal_tx: self.signal_tx.clone(),
|
||||
service_name,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -139,6 +124,7 @@ impl TaskExecutor {
|
||||
/// The purpose of this function is to create a compile error if some function which previously
|
||||
/// returned `()` starts returning something else. Such a case may otherwise result in
|
||||
/// accidental error suppression.
|
||||
#[instrument(parent = None,level = "info", fields(service = self.service_name), name = "task_executor", skip_all)]
|
||||
pub fn spawn_ignoring_error(
|
||||
&self,
|
||||
task: impl Future<Output = Result<(), ()>> + Send + 'static,
|
||||
@@ -150,6 +136,7 @@ impl TaskExecutor {
|
||||
/// Spawn a task to monitor the completion of another task.
|
||||
///
|
||||
/// If the other task exits by panicking, then the monitor task will shut down the executor.
|
||||
#[instrument(parent = None,level = "info", fields(service = self.service_name), name = "task_executor", skip_all)]
|
||||
fn spawn_monitor<R: Send>(
|
||||
&self,
|
||||
task_handle: impl Future<Output = Result<R, tokio::task::JoinError>> + Send + 'static,
|
||||
@@ -168,13 +155,7 @@ impl TaskExecutor {
|
||||
drop(timer);
|
||||
});
|
||||
} else {
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
debug!(
|
||||
self.log,
|
||||
"Couldn't spawn monitor task. Runtime shutting down"
|
||||
);
|
||||
#[cfg(feature = "tracing")]
|
||||
debug!("Couldn't spawn monitor task. Runtime shutting down");
|
||||
debug!("Couldn't spawn monitor task. Runtime shutting down")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -187,6 +168,7 @@ impl TaskExecutor {
|
||||
/// of a panic, the executor will be shut down via `self.signal_tx`.
|
||||
///
|
||||
/// This function generates prometheus metrics on number of tasks and task duration.
|
||||
#[instrument(parent = None,level = "info", fields(service = self.service_name), name = "task_executor", skip_all)]
|
||||
pub fn spawn(&self, task: impl Future<Output = ()> + Send + 'static, name: &'static str) {
|
||||
if let Some(task_handle) = self.spawn_handle(task, name) {
|
||||
self.spawn_monitor(task_handle, name)
|
||||
@@ -202,6 +184,7 @@ impl TaskExecutor {
|
||||
/// This is useful in cases where the future to be spawned needs to do additional cleanup work when
|
||||
/// the task is completed/canceled (e.g. writing local variables to disk) or the task is created from
|
||||
/// some framework which does its own cleanup (e.g. a hyper server).
|
||||
#[instrument(parent = None,level = "info", fields(service = self.service_name), name = "task_executor", skip_all)]
|
||||
pub fn spawn_without_exit(
|
||||
&self,
|
||||
task: impl Future<Output = ()> + Send + 'static,
|
||||
@@ -218,9 +201,6 @@ impl TaskExecutor {
|
||||
if let Some(handle) = self.handle() {
|
||||
handle.spawn(future);
|
||||
} else {
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
debug!(self.log, "Couldn't spawn task. Runtime shutting down");
|
||||
#[cfg(feature = "tracing")]
|
||||
debug!("Couldn't spawn task. Runtime shutting down");
|
||||
}
|
||||
}
|
||||
@@ -242,16 +222,13 @@ impl TaskExecutor {
|
||||
/// The task is cancelled when the corresponding async-channel is dropped.
|
||||
///
|
||||
/// This function generates prometheus metrics on number of tasks and task duration.
|
||||
#[instrument(parent = None,level = "info", fields(service = self.service_name), name = "task_executor", skip_all)]
|
||||
pub fn spawn_handle<R: Send + 'static>(
|
||||
&self,
|
||||
task: impl Future<Output = R> + Send + 'static,
|
||||
name: &'static str,
|
||||
) -> Option<tokio::task::JoinHandle<Option<R>>> {
|
||||
let exit = self.exit();
|
||||
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
let log = self.log.clone();
|
||||
|
||||
if let Some(int_gauge) = metrics::get_int_gauge(&metrics::ASYNC_TASKS_COUNT, &[name]) {
|
||||
// Task is shutdown before it completes if `exit` receives
|
||||
let int_gauge_1 = int_gauge.clone();
|
||||
@@ -262,9 +239,6 @@ impl TaskExecutor {
|
||||
let result = match future::select(Box::pin(task), exit).await {
|
||||
future::Either::Left((value, _)) => Some(value),
|
||||
future::Either::Right(_) => {
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
debug!(log, "Async task shutdown, exit received"; "task" => name);
|
||||
#[cfg(feature = "tracing")]
|
||||
debug!(task = name, "Async task shutdown, exit received");
|
||||
None
|
||||
}
|
||||
@@ -273,9 +247,6 @@ impl TaskExecutor {
|
||||
result
|
||||
}))
|
||||
} else {
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
debug!(log, "Couldn't spawn task. Runtime shutting down");
|
||||
#[cfg(feature = "tracing")]
|
||||
debug!("Couldn't spawn task. Runtime shutting down");
|
||||
None
|
||||
}
|
||||
@@ -290,6 +261,7 @@ impl TaskExecutor {
|
||||
/// The Future returned behaves like the standard JoinHandle which can return an error if the
|
||||
/// task failed.
|
||||
/// This function generates prometheus metrics on number of tasks and task duration.
|
||||
#[instrument(parent = None,level = "info", fields(service = self.service_name), name = "task_executor", skip_all)]
|
||||
pub fn spawn_blocking_handle<F, R>(
|
||||
&self,
|
||||
task: F,
|
||||
@@ -299,18 +271,12 @@ impl TaskExecutor {
|
||||
F: FnOnce() -> R + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
let log = self.log.clone();
|
||||
|
||||
let timer = metrics::start_timer_vec(&metrics::BLOCKING_TASKS_HISTOGRAM, &[name]);
|
||||
metrics::inc_gauge_vec(&metrics::BLOCKING_TASKS_COUNT, &[name]);
|
||||
|
||||
let join_handle = if let Some(handle) = self.handle() {
|
||||
handle.spawn_blocking(task)
|
||||
} else {
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
debug!(self.log, "Couldn't spawn task. Runtime shutting down");
|
||||
#[cfg(feature = "tracing")]
|
||||
debug!("Couldn't spawn task. Runtime shutting down");
|
||||
return None;
|
||||
};
|
||||
@@ -319,9 +285,6 @@ impl TaskExecutor {
|
||||
let result = match join_handle.await {
|
||||
Ok(result) => Ok(result),
|
||||
Err(error) => {
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
debug!(log, "Blocking task ended unexpectedly"; "error" => %error);
|
||||
#[cfg(feature = "tracing")]
|
||||
debug!(%error, "Blocking task ended unexpectedly");
|
||||
Err(error)
|
||||
}
|
||||
@@ -347,6 +310,7 @@ impl TaskExecutor {
|
||||
/// a `tokio` context present in the thread-local storage due to some `rayon` funkiness. Talk to
|
||||
/// @paulhauner if you plan to use this function in production. He has put metrics in here to
|
||||
/// track any use of it, so don't think you can pull a sneaky one on him.
|
||||
#[instrument(parent = None,level = "info", fields(service = self.service_name), name = "task_executor", skip_all)]
|
||||
pub fn block_on_dangerous<F: Future>(
|
||||
&self,
|
||||
future: F,
|
||||
@@ -354,44 +318,20 @@ impl TaskExecutor {
|
||||
) -> Option<F::Output> {
|
||||
let timer = metrics::start_timer_vec(&metrics::BLOCK_ON_TASKS_HISTOGRAM, &[name]);
|
||||
metrics::inc_gauge_vec(&metrics::BLOCK_ON_TASKS_COUNT, &[name]);
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
let log = self.log.clone();
|
||||
let handle = self.handle()?;
|
||||
let exit = self.exit();
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
debug!(
|
||||
log,
|
||||
"Starting block_on task";
|
||||
"name" => name
|
||||
);
|
||||
|
||||
#[cfg(feature = "tracing")]
|
||||
debug!(name, "Starting block_on task");
|
||||
|
||||
handle.block_on(async {
|
||||
let output = tokio::select! {
|
||||
output = future => {
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
debug!(
|
||||
log,
|
||||
"Completed block_on task";
|
||||
"name" => name
|
||||
);
|
||||
#[cfg(feature = "tracing")]
|
||||
debug!(
|
||||
name,
|
||||
"Completed block_on task"
|
||||
);
|
||||
Some(output)
|
||||
},
|
||||
}
|
||||
_ = exit => {
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
debug!(
|
||||
log,
|
||||
"Cancelled block_on task";
|
||||
"name" => name,
|
||||
);
|
||||
#[cfg(feature = "tracing")]
|
||||
debug!(
|
||||
name,
|
||||
"Cancelled block_on task"
|
||||
@@ -406,6 +346,7 @@ impl TaskExecutor {
|
||||
}
|
||||
|
||||
/// Returns a `Handle` to the current runtime.
|
||||
#[instrument(parent = None,level = "info", fields(service = self.service_name), name = "task_executor", skip_all)]
|
||||
pub fn handle(&self) -> Option<Handle> {
|
||||
self.handle_provider.handle()
|
||||
}
|
||||
@@ -420,13 +361,8 @@ impl TaskExecutor {
|
||||
}
|
||||
|
||||
/// Get a channel to request shutting down.
|
||||
#[instrument(parent = None,level = "info", fields(service = self.service_name), name = "task_executor", skip_all)]
|
||||
pub fn shutdown_sender(&self) -> Sender<ShutdownReason> {
|
||||
self.signal_tx.clone()
|
||||
}
|
||||
|
||||
/// Returns a reference to the logger.
|
||||
#[cfg(not(feature = "tracing"))]
|
||||
pub fn log(&self) -> &slog::Logger {
|
||||
&self.log
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
use crate::TaskExecutor;
|
||||
pub use logging::test_logger;
|
||||
use slog::Logger;
|
||||
use std::sync::Arc;
|
||||
use tokio::runtime;
|
||||
|
||||
@@ -16,7 +14,6 @@ pub struct TestRuntime {
|
||||
runtime: Option<Arc<tokio::runtime::Runtime>>,
|
||||
_runtime_shutdown: async_channel::Sender<()>,
|
||||
pub task_executor: TaskExecutor,
|
||||
pub log: Logger,
|
||||
}
|
||||
|
||||
impl Default for TestRuntime {
|
||||
@@ -26,7 +23,6 @@ impl Default for TestRuntime {
|
||||
fn default() -> Self {
|
||||
let (runtime_shutdown, exit) = async_channel::bounded(1);
|
||||
let (shutdown_tx, _) = futures::channel::mpsc::channel(1);
|
||||
let log = test_logger();
|
||||
|
||||
let (runtime, handle) = if let Ok(handle) = runtime::Handle::try_current() {
|
||||
(None, handle)
|
||||
@@ -41,13 +37,12 @@ impl Default for TestRuntime {
|
||||
(Some(runtime), handle)
|
||||
};
|
||||
|
||||
let task_executor = TaskExecutor::new(handle, exit, log.clone(), shutdown_tx);
|
||||
let task_executor = TaskExecutor::new(handle, exit, shutdown_tx, "test".to_string());
|
||||
|
||||
Self {
|
||||
runtime,
|
||||
_runtime_shutdown: runtime_shutdown,
|
||||
task_executor,
|
||||
log,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -59,10 +54,3 @@ impl Drop for TestRuntime {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TestRuntime {
|
||||
pub fn set_logger(&mut self, log: Logger) {
|
||||
self.log = log.clone();
|
||||
self.task_executor.log = log;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user