Modularize tracing executor and metrics rename (#6424)

* Tracing executor and metrics rename

* Appease clippy

* Merge branch 'unstable' into modularise-task-executor
This commit is contained in:
Age Manning
2024-10-28 20:41:45 +11:00
committed by GitHub
parent 8188e036a0
commit e31ac508d4
59 changed files with 364 additions and 323 deletions

View File

@@ -1,10 +0,0 @@
[package]
name = "lighthouse_metrics"
version = "0.2.0"
authors = ["Paul Hauner <paul@paulhauner.com>"]
edition = { workspace = true }
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
prometheus = "0.13.0"

View File

@@ -9,7 +9,7 @@ test_logger = [] # Print log output to stderr when running tests instead of drop
[dependencies]
chrono = { version = "0.4", default-features = false, features = ["clock", "std"] }
lighthouse_metrics = { workspace = true }
metrics = { workspace = true }
parking_lot = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }

View File

@@ -1,6 +1,4 @@
use lighthouse_metrics::{
inc_counter, try_create_int_counter, IntCounter, Result as MetricsResult,
};
use metrics::{inc_counter, try_create_int_counter, IntCounter, Result as MetricsResult};
use slog::Logger;
use slog_term::Decorator;
use std::io::{Result, Write};

View File

@@ -1,6 +1,5 @@
//! Exposes [`MetricsLayer`]: A tracing layer that registers metrics of logging events.
use lighthouse_metrics as metrics;
use std::sync::LazyLock;
use tracing_log::NormalizeEvent;

View File

@@ -5,7 +5,7 @@ authors = ["Paul Hauner <paul@paulhauner.com>"]
edition = { workspace = true }
[dependencies]
lighthouse_metrics = { workspace = true }
metrics = { workspace = true }
libc = "0.2.79"
parking_lot = { workspace = true }
tikv-jemalloc-ctl = { version = "0.6.0", optional = true, features = ["stats"] }

View File

@@ -4,7 +4,7 @@
//! https://www.gnu.org/software/libc/manual/html_node/The-GNU-Allocator.html
//!
//! These functions are generally only suitable for Linux systems.
use lighthouse_metrics::*;
use metrics::*;
use parking_lot::Mutex;
use std::env;
use std::os::raw::c_int;
@@ -38,60 +38,57 @@ pub static GLOBAL_LOCK: LazyLock<Mutex<()>> = LazyLock::new(|| <_>::default());
// Metrics for the malloc. For more information, see:
//
// https://man7.org/linux/man-pages/man3/mallinfo.3.html
pub static MALLINFO_ARENA: LazyLock<lighthouse_metrics::Result<IntGauge>> = LazyLock::new(|| {
pub static MALLINFO_ARENA: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_arena",
"The total amount of memory allocated by means other than mmap(2). \
This figure includes both in-use blocks and blocks on the free list.",
)
});
pub static MALLINFO_ORDBLKS: LazyLock<lighthouse_metrics::Result<IntGauge>> = LazyLock::new(|| {
pub static MALLINFO_ORDBLKS: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_ordblks",
"The number of ordinary (i.e., non-fastbin) free blocks.",
)
});
pub static MALLINFO_SMBLKS: LazyLock<lighthouse_metrics::Result<IntGauge>> =
pub static MALLINFO_SMBLKS: LazyLock<metrics::Result<IntGauge>> =
LazyLock::new(|| try_create_int_gauge("mallinfo_smblks", "The number of fastbin free blocks."));
pub static MALLINFO_HBLKS: LazyLock<lighthouse_metrics::Result<IntGauge>> = LazyLock::new(|| {
pub static MALLINFO_HBLKS: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_hblks",
"The number of blocks currently allocated using mmap.",
)
});
pub static MALLINFO_HBLKHD: LazyLock<lighthouse_metrics::Result<IntGauge>> = LazyLock::new(|| {
pub static MALLINFO_HBLKHD: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_hblkhd",
"The number of bytes in blocks currently allocated using mmap.",
)
});
pub static MALLINFO_FSMBLKS: LazyLock<lighthouse_metrics::Result<IntGauge>> = LazyLock::new(|| {
pub static MALLINFO_FSMBLKS: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_fsmblks",
"The total number of bytes in fastbin free blocks.",
)
});
pub static MALLINFO_UORDBLKS: LazyLock<lighthouse_metrics::Result<IntGauge>> =
LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_uordblks",
"The total number of bytes used by in-use allocations.",
)
});
pub static MALLINFO_FORDBLKS: LazyLock<lighthouse_metrics::Result<IntGauge>> =
LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_fordblks",
"The total number of bytes in free blocks.",
)
});
pub static MALLINFO_KEEPCOST: LazyLock<lighthouse_metrics::Result<IntGauge>> =
LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_keepcost",
"The total amount of releasable free space at the top of the heap..",
)
});
pub static MALLINFO_UORDBLKS: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_uordblks",
"The total number of bytes used by in-use allocations.",
)
});
pub static MALLINFO_FORDBLKS: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_fordblks",
"The total number of bytes in free blocks.",
)
});
pub static MALLINFO_KEEPCOST: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge(
"mallinfo_keepcost",
"The total amount of releasable free space at the top of the heap..",
)
});
/// Calls `mallinfo` and updates Prometheus metrics with the results.
pub fn scrape_mallinfo_metrics() {

View File

@@ -7,7 +7,7 @@
//!
//! A) `JEMALLOC_SYS_WITH_MALLOC_CONF` at compile-time.
//! B) `_RJEM_MALLOC_CONF` at runtime.
use lighthouse_metrics::{set_gauge, try_create_int_gauge, IntGauge};
use metrics::{set_gauge, try_create_int_gauge, IntGauge};
use std::sync::LazyLock;
use tikv_jemalloc_ctl::{arenas, epoch, stats, Error};
@@ -15,22 +15,22 @@ use tikv_jemalloc_ctl::{arenas, epoch, stats, Error};
static ALLOC: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
// Metrics for jemalloc.
pub static NUM_ARENAS: LazyLock<lighthouse_metrics::Result<IntGauge>> =
pub static NUM_ARENAS: LazyLock<metrics::Result<IntGauge>> =
LazyLock::new(|| try_create_int_gauge("jemalloc_num_arenas", "The number of arenas in use"));
pub static BYTES_ALLOCATED: LazyLock<lighthouse_metrics::Result<IntGauge>> = LazyLock::new(|| {
pub static BYTES_ALLOCATED: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge("jemalloc_bytes_allocated", "Equivalent to stats.allocated")
});
pub static BYTES_ACTIVE: LazyLock<lighthouse_metrics::Result<IntGauge>> =
pub static BYTES_ACTIVE: LazyLock<metrics::Result<IntGauge>> =
LazyLock::new(|| try_create_int_gauge("jemalloc_bytes_active", "Equivalent to stats.active"));
pub static BYTES_MAPPED: LazyLock<lighthouse_metrics::Result<IntGauge>> =
pub static BYTES_MAPPED: LazyLock<metrics::Result<IntGauge>> =
LazyLock::new(|| try_create_int_gauge("jemalloc_bytes_mapped", "Equivalent to stats.mapped"));
pub static BYTES_METADATA: LazyLock<lighthouse_metrics::Result<IntGauge>> = LazyLock::new(|| {
pub static BYTES_METADATA: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge("jemalloc_bytes_metadata", "Equivalent to stats.metadata")
});
pub static BYTES_RESIDENT: LazyLock<lighthouse_metrics::Result<IntGauge>> = LazyLock::new(|| {
pub static BYTES_RESIDENT: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge("jemalloc_bytes_resident", "Equivalent to stats.resident")
});
pub static BYTES_RETAINED: LazyLock<lighthouse_metrics::Result<IntGauge>> = LazyLock::new(|| {
pub static BYTES_RETAINED: LazyLock<metrics::Result<IntGauge>> = LazyLock::new(|| {
try_create_int_gauge("jemalloc_bytes_retained", "Equivalent to stats.retained")
});

View File

@@ -0,0 +1,7 @@
[package]
name = "metrics"
version = "0.2.0"
edition = { workspace = true }
[dependencies]
prometheus = { workspace = true }

View File

@@ -20,10 +20,10 @@
//! ## Example
//!
//! ```rust
//! use lighthouse_metrics::*;
//! use metrics::*;
//! use std::sync::LazyLock;
//!
//! // These metrics are "magically" linked to the global registry defined in `lighthouse_metrics`.
//! // These metrics are "magically" linked to the global registry defined in `metrics`.
//! pub static RUN_COUNT: LazyLock<Result<IntCounter>> = LazyLock::new(|| try_create_int_counter(
//! "runs_total",
//! "Total number of runs"

View File

@@ -14,7 +14,7 @@ eth2 = { workspace = true }
serde_json = { workspace = true }
serde = { workspace = true }
lighthouse_version = { workspace = true }
lighthouse_metrics = { workspace = true }
metrics = { workspace = true }
slog = { workspace = true }
store = { workspace = true }
regex = { workspace = true }

View File

@@ -1,5 +1,5 @@
use super::types::{BeaconProcessMetrics, ValidatorProcessMetrics};
use lighthouse_metrics::{MetricFamily, MetricType};
use metrics::{MetricFamily, MetricType};
use serde_json::json;
use std::collections::HashMap;
use std::path::Path;
@@ -155,7 +155,7 @@ fn get_value(mf: &MetricFamily) -> Option<i64> {
/// Collects all metrics and returns a `serde_json::Value` object with the required metrics
/// from the metrics hashmap.
pub fn gather_metrics(metrics_map: &HashMap<String, JsonMetric>) -> Option<serde_json::Value> {
let metric_families = lighthouse_metrics::gather();
let metric_families = metrics::gather();
let mut res = serde_json::Map::with_capacity(metrics_map.len());
for mf in metric_families.iter() {
let metric_name = mf.get_name();

View File

@@ -6,5 +6,5 @@ edition = { workspace = true }
[dependencies]
types = { workspace = true }
lighthouse_metrics = { workspace = true }
metrics = { workspace = true }
parking_lot = { workspace = true }

View File

@@ -1,5 +1,5 @@
use crate::SlotClock;
pub use lighthouse_metrics::*;
pub use metrics::*;
use std::sync::LazyLock;
use types::{EthSpec, Slot};

View File

@@ -4,11 +4,17 @@ version = "0.1.0"
authors = ["Sigma Prime <contact@sigmaprime.io>"]
edition = { workspace = true }
[features]
default = ["slog"]
slog = ["dep:slog", "dep:sloggers", "dep:logging"]
tracing = ["dep:tracing"]
[dependencies]
async-channel = { workspace = true }
tokio = { workspace = true }
slog = { workspace = true }
tokio = { workspace = true, features = ["rt-multi-thread", "macros"] }
slog = { workspace = true, optional = true }
futures = { workspace = true }
lighthouse_metrics = { workspace = true }
sloggers = { workspace = true }
logging = { workspace = true }
metrics = { workspace = true }
sloggers = { workspace = true, optional = true }
logging = { workspace = true, optional = true }
tracing = { workspace = true, optional = true }

View File

@@ -1,14 +1,20 @@
mod metrics;
#[cfg(not(feature = "tracing"))]
pub mod test_utils;
use futures::channel::mpsc::Sender;
use futures::prelude::*;
use slog::{debug, o, trace};
use std::sync::Weak;
use tokio::runtime::{Handle, Runtime};
pub use tokio::task::JoinHandle;
// Set up logging framework
#[cfg(not(feature = "tracing"))]
use slog::{debug, o};
#[cfg(feature = "tracing")]
use tracing::debug;
/// Provides a reason when Lighthouse is shut down.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum ShutdownReason {
@@ -79,7 +85,7 @@ pub struct TaskExecutor {
///
/// The task must provide a reason for shutting down.
signal_tx: Sender<ShutdownReason>,
#[cfg(not(feature = "tracing"))]
log: slog::Logger,
}
@@ -94,18 +100,20 @@ impl TaskExecutor {
pub fn new<T: Into<HandleProvider>>(
handle: T,
exit: async_channel::Receiver<()>,
log: slog::Logger,
#[cfg(not(feature = "tracing"))] log: slog::Logger,
signal_tx: Sender<ShutdownReason>,
) -> Self {
Self {
handle_provider: handle.into(),
exit,
signal_tx,
#[cfg(not(feature = "tracing"))]
log,
}
}
/// Clones the task executor adding a service name.
#[cfg(not(feature = "tracing"))]
pub fn clone_with_name(&self, service_name: String) -> Self {
TaskExecutor {
handle_provider: self.handle_provider.clone(),
@@ -115,6 +123,16 @@ impl TaskExecutor {
}
}
/// Clones the task executor.
///
/// This is the `tracing`-feature counterpart of `clone_with_name`. It takes no service name:
/// with `tracing` enabled the executor has no `log` field, so only the handle provider, the
/// exit receiver and the shutdown signal sender are cloned.
#[cfg(feature = "tracing")]
pub fn clone(&self) -> Self {
TaskExecutor {
handle_provider: self.handle_provider.clone(),
exit: self.exit.clone(),
signal_tx: self.signal_tx.clone(),
}
}
/// A convenience wrapper for `Self::spawn` which ignores a `Result` as long as both `Ok`/`Err`
/// are of type `()`.
///
@@ -150,10 +168,13 @@ impl TaskExecutor {
drop(timer);
});
} else {
#[cfg(not(feature = "tracing"))]
debug!(
self.log,
"Couldn't spawn monitor task. Runtime shutting down"
)
);
#[cfg(feature = "tracing")]
debug!("Couldn't spawn monitor task. Runtime shutting down");
}
}
@@ -175,7 +196,7 @@ impl TaskExecutor {
/// Spawn a future on the tokio runtime. This function does not wrap the task in an `async-channel::Receiver`
/// like [spawn](#method.spawn).
/// The caller of this function is responsible for wrapping up the task with an `async-channel::Receiver` to
/// ensure that the task gets canceled appropriately.
/// ensure that the task gets cancelled appropriately.
/// This function generates prometheus metrics on number of tasks and task duration.
///
/// This is useful in cases where the future to be spawned needs to do additional cleanup work when
@@ -197,7 +218,10 @@ impl TaskExecutor {
if let Some(handle) = self.handle() {
handle.spawn(future);
} else {
#[cfg(not(feature = "tracing"))]
debug!(self.log, "Couldn't spawn task. Runtime shutting down");
#[cfg(feature = "tracing")]
debug!("Couldn't spawn task. Runtime shutting down");
}
}
}
@@ -215,7 +239,7 @@ impl TaskExecutor {
/// Spawn a future on the tokio runtime wrapped in an `async-channel::Receiver` returning an optional
/// join handle to the future.
/// The task is canceled when the corresponding async-channel is dropped.
/// The task is cancelled when the corresponding async-channel is dropped.
///
/// This function generates prometheus metrics on number of tasks and task duration.
pub fn spawn_handle<R: Send + 'static>(
@@ -224,6 +248,8 @@ impl TaskExecutor {
name: &'static str,
) -> Option<tokio::task::JoinHandle<Option<R>>> {
let exit = self.exit();
#[cfg(not(feature = "tracing"))]
let log = self.log.clone();
if let Some(int_gauge) = metrics::get_int_gauge(&metrics::ASYNC_TASKS_COUNT, &[name]) {
@@ -234,12 +260,12 @@ impl TaskExecutor {
Some(handle.spawn(async move {
futures::pin_mut!(exit);
let result = match future::select(Box::pin(task), exit).await {
future::Either::Left((value, _)) => {
trace!(log, "Async task completed"; "task" => name);
Some(value)
}
future::Either::Left((value, _)) => Some(value),
future::Either::Right(_) => {
#[cfg(not(feature = "tracing"))]
debug!(log, "Async task shutdown, exit received"; "task" => name);
#[cfg(feature = "tracing")]
debug!(task = name, "Async task shutdown, exit received");
None
}
};
@@ -247,7 +273,10 @@ impl TaskExecutor {
result
}))
} else {
debug!(self.log, "Couldn't spawn task. Runtime shutting down");
#[cfg(not(feature = "tracing"))]
debug!(log, "Couldn't spawn task. Runtime shutting down");
#[cfg(feature = "tracing")]
debug!("Couldn't spawn task. Runtime shutting down");
None
}
} else {
@@ -270,6 +299,7 @@ impl TaskExecutor {
F: FnOnce() -> R + Send + 'static,
R: Send + 'static,
{
#[cfg(not(feature = "tracing"))]
let log = self.log.clone();
let timer = metrics::start_timer_vec(&metrics::BLOCKING_TASKS_HISTOGRAM, &[name]);
@@ -278,19 +308,22 @@ impl TaskExecutor {
let join_handle = if let Some(handle) = self.handle() {
handle.spawn_blocking(task)
} else {
#[cfg(not(feature = "tracing"))]
debug!(self.log, "Couldn't spawn task. Runtime shutting down");
#[cfg(feature = "tracing")]
debug!("Couldn't spawn task. Runtime shutting down");
return None;
};
let future = async move {
let result = match join_handle.await {
Ok(result) => {
trace!(log, "Blocking task completed"; "task" => name);
Ok(result)
}
Err(e) => {
debug!(log, "Blocking task ended unexpectedly"; "error" => %e);
Err(e)
Ok(result) => Ok(result),
Err(error) => {
#[cfg(not(feature = "tracing"))]
debug!(log, "Blocking task ended unexpectedly"; "error" => %error);
#[cfg(feature = "tracing")]
debug!(%error, "Blocking task ended unexpectedly");
Err(error)
}
};
drop(timer);
@@ -321,32 +354,48 @@ impl TaskExecutor {
) -> Option<F::Output> {
let timer = metrics::start_timer_vec(&metrics::BLOCK_ON_TASKS_HISTOGRAM, &[name]);
metrics::inc_gauge_vec(&metrics::BLOCK_ON_TASKS_COUNT, &[name]);
#[cfg(not(feature = "tracing"))]
let log = self.log.clone();
let handle = self.handle()?;
let exit = self.exit();
#[cfg(not(feature = "tracing"))]
debug!(
log,
"Starting block_on task";
"name" => name
);
#[cfg(feature = "tracing")]
debug!(name, "Starting block_on task");
handle.block_on(async {
let output = tokio::select! {
output = future => {
#[cfg(not(feature = "tracing"))]
debug!(
log,
"Completed block_on task";
"name" => name
);
#[cfg(feature = "tracing")]
debug!(
name,
"Completed block_on task"
);
Some(output)
},
_ = exit => {
#[cfg(not(feature = "tracing"))]
debug!(
log,
"Cancelled block_on task";
"name" => name,
);
#[cfg(feature = "tracing")]
debug!(
name,
"Cancelled block_on task"
);
None
}
};
@@ -376,6 +425,7 @@ impl TaskExecutor {
}
/// Returns a reference to the logger.
///
/// Only compiled when the `tracing` feature is disabled; with `tracing` enabled the executor
/// has no `log` field and this accessor does not exist.
#[cfg(not(feature = "tracing"))]
pub fn log(&self) -> &slog::Logger {
&self.log
}

View File

@@ -1,5 +1,5 @@
/// Handles async task metrics
pub use lighthouse_metrics::*;
pub use metrics::*;
use std::sync::LazyLock;
pub static ASYNC_TASKS_COUNT: LazyLock<Result<IntGaugeVec>> = LazyLock::new(|| {

View File

@@ -17,6 +17,6 @@ serde = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
headers = "0.3.2"
lighthouse_metrics = { workspace = true }
metrics = { workspace = true }
serde_array_query = "0.1.0"
bytes = { workspace = true }

View File

@@ -1,5 +1,5 @@
use eth2::lighthouse::{ProcessHealth, SystemHealth};
use lighthouse_metrics::*;
use metrics::*;
use std::sync::LazyLock;
pub static PROCESS_NUM_THREADS: LazyLock<Result<IntGauge>> = LazyLock::new(|| {