Monitoring service api (#2251)

## Issue Addressed

N/A

## Proposed Changes

Adds a client-side API for collecting system and process metrics and pushing them to a monitoring service.
This commit is contained in:
Pawan Dhananjay
2021-05-26 05:58:41 +00:00
parent 55aada006f
commit fdaeec631b
30 changed files with 1108 additions and 65 deletions

View File

@@ -0,0 +1,24 @@
[package]
name = "monitoring_api"
version = "0.1.0"
authors = ["pawan <pawandhananjay@gmail.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
reqwest = { version = "0.11.0", features = ["json","stream"] }
futures = "0.3.7"
task_executor = { path = "../task_executor" }
tokio = "1.1.0"
eth2 = {path = "../eth2"}
serde_json = "1.0.58"
serde = "1.0.116"
serde_derive = "1.0.116"
lighthouse_version = { path = "../lighthouse_version"}
lighthouse_metrics = { path = "../lighthouse_metrics" }
slog = "2.5.2"
store = { path = "../../beacon_node/store" }
lazy_static = "1.4.0"
regex = "1"
sensitive_url = { path = "../sensitive_url" }

View File

@@ -0,0 +1,193 @@
use super::types::{BeaconProcessMetrics, ValidatorProcessMetrics};
use lazy_static::lazy_static;
use lighthouse_metrics::{MetricFamily, MetricType};
use serde_json::json;
use std::collections::HashMap;
use std::path::Path;
/// Describes one metric that is read from the Lighthouse metrics registry and
/// forwarded to the remote monitoring service, possibly under a different JSON key.
#[derive(Debug, Clone)]
pub struct JsonMetric {
/// Name of the metric as registered in the Lighthouse metrics registry.
lighthouse_metric_name: &'static str,
/// JSON key under which the metric is sent to the remote monitoring endpoint.
json_output_key: &'static str,
/// JSON type (integer or boolean) of the value sent to the remote monitoring endpoint.
ty: JsonType,
}
impl JsonMetric {
    /// Builds a metric description.
    ///
    /// This is a `const fn` so that it can be used to populate the constant metric tables.
    const fn new(
        lighthouse_metric_name: &'static str,
        json_output_key: &'static str,
        ty: JsonType,
    ) -> Self {
        Self {
            lighthouse_metric_name,
            json_output_key,
            ty,
        }
    }

    /// Converts a raw metric reading into the JSON value dictated by this metric's type.
    ///
    /// Integer metrics are emitted unchanged; boolean metrics are `true` for any strictly
    /// positive reading and `false` otherwise.
    fn get_typed_value(&self, value: i64) -> serde_json::Value {
        match self.ty {
            JsonType::Boolean => json!(value > 0),
            JsonType::Integer => json!(value),
        }
    }
}
/// The metrics exported for the beacon node process.
///
/// Each entry maps a Lighthouse metric name to the JSON key and JSON type under which
/// it is sent to the remote monitoring service.
const BEACON_PROCESS_METRICS: &[JsonMetric] = &[
JsonMetric::new(
"sync_eth1_fallback_configured",
"sync_eth1_fallback_configured",
JsonType::Boolean,
),
JsonMetric::new(
"sync_eth1_fallback_connected",
"sync_eth1_fallback_connected",
JsonType::Boolean,
),
JsonMetric::new(
"sync_eth1_connected",
"sync_eth1_connected",
JsonType::Boolean,
),
JsonMetric::new(
"store_disk_db_size",
"disk_beaconchain_bytes_total",
JsonType::Integer,
),
JsonMetric::new(
"libp2p_peer_connected_peers_total",
"network_peers_connected",
JsonType::Integer,
),
JsonMetric::new(
"libp2p_outbound_bytes",
"network_libp2p_bytes_total_transmit",
JsonType::Integer,
),
JsonMetric::new(
"libp2p_inbound_bytes",
"network_libp2p_bytes_total_receive",
JsonType::Integer,
),
JsonMetric::new(
"notifier_head_slot",
"sync_beacon_head_slot",
JsonType::Integer,
),
JsonMetric::new("sync_eth2_synced", "sync_eth2_synced", JsonType::Boolean),
];
/// The metrics exported for the validator client process.
///
/// Each entry maps a Lighthouse metric name to the JSON key and JSON type under which
/// it is sent to the remote monitoring service.
const VALIDATOR_PROCESS_METRICS: &[JsonMetric] = &[
JsonMetric::new(
"vc_validators_enabled_count",
"validator_active",
JsonType::Integer,
),
JsonMetric::new(
"vc_validators_total_count",
"validator_total",
JsonType::Integer,
),
JsonMetric::new(
"sync_eth2_fallback_configured",
"sync_eth2_fallback_configured",
JsonType::Boolean,
),
JsonMetric::new(
"sync_eth2_fallback_connected",
"sync_eth2_fallback_connected",
JsonType::Boolean,
),
];
/// The JSON type a metric value is converted to before being sent to the monitoring service.
#[derive(Debug, Clone)]
pub enum JsonType {
/// The raw metric reading is sent as a JSON integer.
Integer,
/// The metric is sent as a JSON boolean: `true` when the raw reading is greater than zero.
Boolean,
}
lazy_static! {
/// `BEACON_PROCESS_METRICS` indexed by Lighthouse metric name.
pub static ref BEACON_METRICS_MAP: HashMap<String, JsonMetric> = BEACON_PROCESS_METRICS
.iter()
.map(|metric| (metric.lighthouse_metric_name.to_string(), metric.clone()))
.collect();
/// `VALIDATOR_PROCESS_METRICS` indexed by Lighthouse metric name.
pub static ref VALIDATOR_METRICS_MAP: HashMap<String,JsonMetric> =
VALIDATOR_PROCESS_METRICS
.iter()
.map(|metric| (metric.lighthouse_metric_name.to_string(), metric.clone()))
.collect();
}
/// Reads the value of the first sample in a metric family, truncated to an `i64`.
///
/// Only counter and gauge families are supported. Returns `None` when the family has no
/// samples or is of any other metric type. Samples after the first are ignored.
fn get_value(mf: &MetricFamily) -> Option<i64> {
    let sample = mf.get_metric().first()?;
    let raw = match mf.get_field_type() {
        MetricType::GAUGE => sample.get_gauge().get_value(),
        MetricType::COUNTER => sample.get_counter().get_value(),
        _ => return None,
    };
    Some(raw as i64)
}
/// Gathers every metric from the Lighthouse registry and returns a JSON object holding
/// only those listed in `metrics_map`.
///
/// `metrics_map` is keyed by Lighthouse metric name. Each matching metric is written under
/// its `json_output_key`, converted to the JSON type configured for it. A metric whose value
/// cannot be read (see `get_value`) is reported as `0` / `false`. Metrics in `metrics_map`
/// that are absent from the registry are omitted from the output.
///
/// This function currently always returns `Some`; the `Option` is kept for interface
/// compatibility with existing callers.
pub fn gather_metrics(metrics_map: &HashMap<String, JsonMetric>) -> Option<serde_json::Value> {
    let metric_families = lighthouse_metrics::gather();
    let mut res = serde_json::Map::with_capacity(metrics_map.len());
    for mf in metric_families.iter() {
        // A single lookup both tests for membership and yields the metric description.
        if let Some(metric) = metrics_map.get(mf.get_name()) {
            let raw = get_value(mf).unwrap_or_default();
            let _ = res.insert(
                metric.json_output_key.to_string(),
                metric.get_typed_value(raw),
            );
        }
    }
    Some(serde_json::Value::Object(res))
}
/// Collects the beacon node metrics together with the common process metrics.
///
/// `db_path` and `freezer_db_path` are passed to the store so that the database size
/// metrics are refreshed before the registry is read.
///
/// Returns an error string if the metrics cannot be gathered or the process health
/// cannot be observed.
pub fn gather_beacon_metrics(
    db_path: &Path,
    freezer_db_path: &Path,
) -> Result<BeaconProcessMetrics, String> {
    // Refresh the db size metrics before reading the registry.
    store::metrics::scrape_for_metrics(db_path, freezer_db_path);

    let beacon = match gather_metrics(&BEACON_METRICS_MAP) {
        Some(json) => json,
        None => return Err("Failed to gather beacon metrics".to_string()),
    };
    let health = eth2::lighthouse::ProcessHealth::observe()?;

    Ok(BeaconProcessMetrics {
        beacon,
        common: health.into(),
    })
}
/// Collects the validator client metrics together with the common process metrics.
///
/// Returns an error string if the metrics cannot be gathered or the process health
/// cannot be observed.
pub fn gather_validator_metrics() -> Result<ValidatorProcessMetrics, String> {
    let validator = match gather_metrics(&VALIDATOR_METRICS_MAP) {
        Some(json) => json,
        None => return Err("Failed to gather validator metrics".to_string()),
    };
    let health = eth2::lighthouse::ProcessHealth::observe()?;

    Ok(ValidatorProcessMetrics {
        validator,
        common: health.into(),
    })
}

View File

@@ -0,0 +1,208 @@
mod gather;
mod types;
use std::{path::PathBuf, time::Duration};
use eth2::lighthouse::SystemHealth;
use gather::{gather_beacon_metrics, gather_validator_metrics};
use reqwest::{IntoUrl, Response};
pub use reqwest::{StatusCode, Url};
use sensitive_url::SensitiveUrl;
use serde::{Deserialize, Serialize};
use slog::{debug, error, info};
use task_executor::TaskExecutor;
use tokio::time::{interval_at, Instant};
use types::*;
pub use types::ProcessType;
/// Interval, in seconds, between successive collections and uploads of metrics to the
/// remote endpoint.
pub const UPDATE_DURATION: u64 = 60;
/// Timeout, in seconds, applied to each HTTP request.
pub const TIMEOUT_DURATION: u64 = 5;
/// Errors that can occur while gathering metrics or sending them to the remote endpoint.
#[derive(Debug)]
pub enum Error {
/// The `reqwest` client raised an error.
Reqwest(reqwest::Error),
/// The supplied URL is badly formatted. It should look something like `http://127.0.0.1:5052`.
InvalidUrl(SensitiveUrl),
/// Observing the system health failed; carries the underlying error message.
SystemMetricsFailed(String),
/// Gathering the beacon node metrics failed; carries the underlying error message.
BeaconMetricsFailed(String),
/// Gathering the validator client metrics failed; carries the underlying error message.
ValidatorMetricsFailed(String),
/// The server returned an error message where the body was able to be parsed.
ServerMessage(ErrorMessage),
/// The server returned an error message where the body was unable to be parsed.
StatusCode(StatusCode),
}
impl std::fmt::Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match &self {
Error::Reqwest(e) => write!(f, "Reqwest error: {}", e),
// Print the debug value
e => write!(f, "{:?}", e),
}
}
}
/// Configuration for the monitoring client.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct Config {
/// URL of the remote monitoring endpoint that metrics are POSTed to.
pub monitoring_endpoint: String,
/// Path for the hot database required for fetching beacon db size metrics.
/// Note: not relevant for validator and system metrics.
pub db_path: Option<PathBuf>,
/// Path for the cold database required for fetching beacon db size metrics.
/// Note: not relevant for validator and system metrics.
pub freezer_db_path: Option<PathBuf>,
}
/// HTTP client that gathers metrics and pushes them to a remote monitoring endpoint.
#[derive(Clone)]
pub struct MonitoringHttpClient {
/// Underlying HTTP client used to send the requests.
client: reqwest::Client,
/// Path to the hot database. Required for getting db size metrics
db_path: Option<PathBuf>,
/// Path to the freezer database.
freezer_db_path: Option<PathBuf>,
/// Remote endpoint the metrics are POSTed to.
monitoring_endpoint: SensitiveUrl,
/// Logger used to report progress and failures.
log: slog::Logger,
}
impl MonitoringHttpClient {
/// Creates a monitoring client from `config`.
///
/// Returns an error string if `config.monitoring_endpoint` cannot be parsed as a URL.
pub fn new(config: &Config, log: slog::Logger) -> Result<Self, String> {
    let client = reqwest::Client::new();
    let db_path = config.db_path.clone();
    let freezer_db_path = config.freezer_db_path.clone();
    let monitoring_endpoint = SensitiveUrl::parse(&config.monitoring_endpoint)
        .map_err(|e| format!("Invalid monitoring endpoint: {:?}", e))?;

    Ok(Self {
        client,
        db_path,
        freezer_db_path,
        monitoring_endpoint,
        log,
    })
}
/// Perform a HTTP POST request.
async fn post<T: Serialize, U: IntoUrl>(&self, url: U, body: &T) -> Result<(), Error> {
let response = self
.client
.post(url)
.json(body)
.timeout(Duration::from_secs(TIMEOUT_DURATION))
.send()
.await
.map_err(Error::Reqwest)?;
ok_or_error(response).await?;
Ok(())
}
/// Creates a task which periodically sends the provided process metrics
/// to the configured remote endpoint.
pub fn auto_update(self, executor: TaskExecutor, processes: Vec<ProcessType>) {
let mut interval = interval_at(
// Have some initial delay for the metrics to get initialized
Instant::now() + Duration::from_secs(25),
Duration::from_secs(UPDATE_DURATION),
);
info!(self.log, "Starting monitoring api"; "endpoint" => %self.monitoring_endpoint);
let update_future = async move {
loop {
interval.tick().await;
match self.send_metrics(&processes).await {
Ok(()) => {
debug!(self.log, "Metrics sent to remote server"; "endpoint" => %self.monitoring_endpoint);
}
Err(e) => {
error!(self.log, "Failed to send metrics to remote endpoint"; "error" => %e)
}
}
}
};
executor.spawn(update_future, "monitoring_api");
}
/// Gets beacon metrics and updates the metrics struct
pub fn get_beacon_metrics(&self) -> Result<MonitoringMetrics, Error> {
let db_path = self.db_path.as_ref().ok_or_else(|| {
Error::BeaconMetricsFailed("Beacon metrics require db path".to_string())
})?;
let freezer_db_path = self.db_path.as_ref().ok_or_else(|| {
Error::BeaconMetricsFailed("Beacon metrics require freezer db path".to_string())
})?;
let metrics = gather_beacon_metrics(&db_path, &freezer_db_path)
.map_err(Error::BeaconMetricsFailed)?;
Ok(MonitoringMetrics {
metadata: Metadata::new(ProcessType::BeaconNode),
process_metrics: Process::Beacon(metrics),
})
}
/// Gets validator process metrics by querying the validator metrics endpoint
pub fn get_validator_metrics(&self) -> Result<MonitoringMetrics, Error> {
let metrics = gather_validator_metrics().map_err(Error::BeaconMetricsFailed)?;
Ok(MonitoringMetrics {
metadata: Metadata::new(ProcessType::Validator),
process_metrics: Process::Validator(metrics),
})
}
/// Gets system metrics by observing capturing the SystemHealth metrics.
pub fn get_system_metrics(&self) -> Result<MonitoringMetrics, Error> {
let system_health = SystemHealth::observe().map_err(Error::SystemMetricsFailed)?;
Ok(MonitoringMetrics {
metadata: Metadata::new(ProcessType::System),
process_metrics: Process::System(system_health.into()),
})
}
/// Gathers the metrics for the given process type by dispatching to the matching getter.
pub async fn get_metrics(
    &self,
    process_type: &ProcessType,
) -> Result<MonitoringMetrics, Error> {
    match process_type {
        ProcessType::System => self.get_system_metrics(),
        ProcessType::Validator => self.get_validator_metrics(),
        ProcessType::BeaconNode => self.get_beacon_metrics(),
    }
}
/// Gathers the metrics of every process in `processes` and POSTs them to the remote
/// endpoint in a single request.
///
/// A process whose metrics cannot be gathered is logged and left out; the remaining
/// metrics are still sent.
pub async fn send_metrics(&self, processes: &[ProcessType]) -> Result<(), Error> {
    let mut collected = Vec::new();
    for process_type in processes {
        match self.get_metrics(process_type).await {
            Ok(metric) => collected.push(metric),
            Err(e) => error!(
                self.log,
                "Failed to get metrics";
                "process_type" => ?process_type,
                "error" => %e
            ),
        }
    }

    info!(
        self.log,
        "Sending metrics to remote endpoint";
        "endpoint" => %self.monitoring_endpoint
    );

    let endpoint = self.monitoring_endpoint.full.clone();
    self.post(endpoint, &collected).await
}
}
/// Returns `Ok(response)` if the response is a `200 OK` response. Otherwise, creates an
/// appropriate error message.
async fn ok_or_error(response: Response) -> Result<Response, Error> {
let status = response.status();
if status == StatusCode::OK {
Ok(response)
} else if let Ok(message) = response.json().await {
Err(Error::ServerMessage(message))
} else {
Err(Error::StatusCode(status))
}
}

View File

@@ -0,0 +1,177 @@
use std::time::{SystemTime, UNIX_EPOCH};
use eth2::lighthouse::{ProcessHealth, SystemHealth};
use serde_derive::{Deserialize, Serialize};
/// Version number reported in the metadata of every metrics payload.
pub const VERSION: u64 = 1;
/// Client name reported in the common process metrics.
pub const CLIENT_NAME: &str = "lighthouse";
/// An API error serializable to JSON.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ErrorMessage {
/// Numeric error code reported by the server.
pub code: u16,
/// Error message reported by the server.
pub message: String,
/// Stack traces reported by the server; empty when the field is absent from the JSON.
#[serde(default)]
pub stacktraces: Vec<String>,
}
/// A single metrics record sent to the monitoring service.
///
/// Both fields are flattened, so the metadata and the process metrics are serialized as
/// the fields of one JSON object.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct MonitoringMetrics {
/// Version, timestamp and process type of this record.
#[serde(flatten)]
pub metadata: Metadata,
/// The metrics of the process described by `metadata`.
#[serde(flatten)]
pub process_metrics: Process,
}
/// The kind of process a metrics record describes.
///
/// Variant names are serialized in lowercase (e.g. `BeaconNode` becomes `"beaconnode"`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ProcessType {
/// The beacon node process.
BeaconNode,
/// The validator client process.
Validator,
/// The host system.
System,
}
/// Metadata attached to every metrics record.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Metadata {
/// Version number of the record; set to `VERSION` by `Metadata::new`.
version: u64,
/// Creation time of the record, in milliseconds since the Unix epoch.
timestamp: u128,
/// The kind of process the record describes.
process: ProcessType,
}
impl Metadata {
    /// Creates metadata for `process`, stamped with the current version and the current
    /// time in milliseconds since the Unix epoch.
    ///
    /// # Panics
    ///
    /// Panics if the system clock is set to a time before the Unix epoch.
    pub fn new(process: ProcessType) -> Self {
        let since_epoch = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("time should be greater than unix epoch");

        Self {
            version: VERSION,
            timestamp: since_epoch.as_millis(),
            process,
        }
    }
}
/// The metrics payload of a record, one variant per kind of process.
///
/// The enum is untagged: the payload is serialized as the inner struct alone, with no
/// variant name in the JSON.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Process {
/// Metrics of the beacon node process.
Beacon(BeaconProcessMetrics),
/// Metrics of the host system.
System(SystemMetrics),
/// Metrics of the validator client process.
Validator(ValidatorProcessMetrics),
}
/// Common metrics for all processes.
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct ProcessMetrics {
/// Taken from `ProcessHealth::pid_process_seconds_total`.
cpu_process_seconds_total: u64,
/// Taken from `ProcessHealth::pid_mem_resident_set_size`.
memory_process_bytes: u64,
/// Name of the client; always `CLIENT_NAME` when built from a `ProcessHealth`.
client_name: String,
/// Version of the client, or an empty string if it could not be determined.
client_version: String,
/// Build number of the client.
client_build: u64,
}
impl From<ProcessHealth> for ProcessMetrics {
    /// Builds the common process metrics from an observed `ProcessHealth`, adding the
    /// client name, version and build number.
    fn from(health: ProcessHealth) -> Self {
        let cpu_process_seconds_total = health.pid_process_seconds_total;
        let memory_process_bytes = health.pid_mem_resident_set_size;
        // An undeterminable version is reported as an empty string.
        let version = client_version().unwrap_or_default();
        let build = client_build();

        Self {
            cpu_process_seconds_total,
            memory_process_bytes,
            client_name: CLIENT_NAME.to_string(),
            client_version: version,
            client_build: build,
        }
    }
}
/// Metrics related to the system.
///
/// Apart from `disk_node_io_seconds` and `misc_os`, every field is copied from the
/// corresponding `SystemHealth` field by the `From<SystemHealth>` implementation.
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct SystemMetrics {
cpu_cores: u64,
cpu_threads: u64,
cpu_node_system_seconds_total: u64,
cpu_node_user_seconds_total: u64,
cpu_node_iowait_seconds_total: u64,
cpu_node_idle_seconds_total: u64,
memory_node_bytes_total: u64,
memory_node_bytes_free: u64,
memory_node_bytes_cached: u64,
memory_node_bytes_buffers: u64,
disk_node_bytes_total: u64,
disk_node_bytes_free: u64,
// Always reported as 0 when built from a `SystemHealth`.
disk_node_io_seconds: u64,
disk_node_reads_total: u64,
disk_node_writes_total: u64,
network_node_bytes_total_receive: u64,
network_node_bytes_total_transmit: u64,
misc_node_boot_ts_seconds: u64,
// First three bytes of the OS name, or "unk" when built from a `SystemHealth` whose
// OS name cannot be sliced that way.
misc_os: String,
}
impl From<SystemHealth> for SystemMetrics {
/// Builds the system metrics from an observed `SystemHealth`.
fn from(health: SystemHealth) -> Self {
// Export format uses 3 letter os names.
// Falls back to "unk" when the first three bytes cannot be sliced out of the name
// (the name is shorter than three bytes or byte 3 is not a character boundary).
let misc_os = health.misc_os.get(0..3).unwrap_or("unk").to_string();
Self {
cpu_cores: health.cpu_cores,
cpu_threads: health.cpu_threads,
// NOTE(review): the "system seconds" field is filled from `cpu_time_total` — confirm
// this is the intended source.
cpu_node_system_seconds_total: health.cpu_time_total,
cpu_node_user_seconds_total: health.user_seconds_total,
cpu_node_iowait_seconds_total: health.iowait_seconds_total,
cpu_node_idle_seconds_total: health.idle_seconds_total,
memory_node_bytes_total: health.sys_virt_mem_total,
memory_node_bytes_free: health.sys_virt_mem_free,
memory_node_bytes_cached: health.sys_virt_mem_cached,
memory_node_bytes_buffers: health.sys_virt_mem_buffers,
disk_node_bytes_total: health.disk_node_bytes_total,
disk_node_bytes_free: health.disk_node_bytes_free,
// Unavailable for now
disk_node_io_seconds: 0,
disk_node_reads_total: health.disk_node_reads_total,
disk_node_writes_total: health.disk_node_writes_total,
network_node_bytes_total_receive: health.network_node_bytes_total_received,
network_node_bytes_total_transmit: health.network_node_bytes_total_transmit,
misc_node_boot_ts_seconds: health.misc_node_boot_ts_seconds,
misc_os,
}
}
}
/// All beacon process metrics.
///
/// Both fields are flattened, so the common and beacon-specific metrics are serialized
/// as the fields of one JSON object.
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct BeaconProcessMetrics {
/// Metrics common to all processes.
#[serde(flatten)]
pub common: ProcessMetrics,
/// Beacon-specific metrics as a JSON value.
#[serde(flatten)]
pub beacon: serde_json::Value,
}
/// All validator process metrics.
///
/// Both fields are flattened, so the common and validator-specific metrics are serialized
/// as the fields of one JSON object.
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct ValidatorProcessMetrics {
/// Metrics common to all processes.
#[serde(flatten)]
pub common: ProcessMetrics,
/// Validator-specific metrics as a JSON value.
#[serde(flatten)]
pub validator: serde_json::Value,
}
/// Extracts the first `<digits>.<digits>.<digits>` sequence from the Lighthouse version
/// string.
///
/// Returns `None` if the version string contains no such sequence.
fn client_version() -> Option<String> {
    let version_pattern = regex::Regex::new(r"\d+\.\d+\.\d+").expect("Regex is valid");
    let found = version_pattern.find(lighthouse_version::VERSION)?;
    Some(found.as_str().to_string())
}
/// Returns the client build number.
///
/// Note: Lighthouse does not support build numbers, so this is effectively a null-value.
fn client_build() -> u64 {
    // Placeholder reported in lieu of a real build number.
    const NO_BUILD_NUMBER: u64 = 0;
    NO_BUILD_NUMBER
}